This file is designed to analyze coronavirus data from a single state using three data sources:
Code for processing data from each of these sources is available in the four sourced function files listed below (Shared, CTP, USAF, and CDC, all version v003).
The goal of this file is to download updated data for the three main data sources, and to explore the performance of the segments as measured against the new data.
Functions are sourced and a variable mapping file is created:
# All functions assume that tidyverse and its components are loaded and available
# Other functions are declared in the sourcing files or use library::function()
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Source the helper-function files. Order matters: shared functions first, then
# the source-specific files (CTP, USA Facts, CDC); a later source() call
# overwrites any same-named function defined by an earlier one.
# If the same function is in both files, use the version from the more specific source
source("./Coronavirus_Statistics_Functions_Shared_v003.R")
source("./Coronavirus_Statistics_Functions_CTP_v003.R")
source("./Coronavirus_Statistics_Functions_USAF_v003.R")
source("./Coronavirus_Statistics_Functions_CDC_v003.R")
# Lookup table mapping internal variable names to human-readable labels
# for plots and tables, e.g. varMapper[["cpm"]] yields "Cases per million".
varMapper <- c(
  # Raw counts and rolling means
  "cases"       = "Cases",
  "newCases"    = "Increase in cases, most recent 30 days",
  "casesroll7"  = "Rolling 7-day mean cases",
  "deaths"      = "Deaths",
  "newDeaths"   = "Increase in deaths, most recent 30 days",
  "deathsroll7" = "Rolling 7-day mean deaths",
  # Population-normalized (per million) measures
  "cpm"         = "Cases per million",
  "cpm7"        = "Cases per day (7-day rolling mean) per million",
  "newcpm"      = "Increase in cases, most recent 30 days, per million",
  "dpm"         = "Deaths per million",
  "dpm7"        = "Deaths per day (7-day rolling mean) per million",
  "newdpm"      = "Increase in deaths, most recent 30 days, per million",
  "hpm7"        = "Currently Hospitalized per million (7-day rolling mean)",
  "tpm"         = "Tests per million",
  "tpm7"        = "Tests per million per day (7-day rolling mean)",
  # Source-specific series used in cross-source comparisons
  "cdcExcess"   = "Excess all-cause (CDC)",
  "ctp_death7"  = "COVID Tracking Project",
  "usaf_death7" = "USA Facts",
  "CDC_deaths"  = "CDC total deaths",
  "CDC_excess"  = "CDC excess deaths",
  "CTP_cases"   = "COVID Tracking Project cases",
  "CTP_deaths"  = "COVID Tracking Project deaths",
  "CTP_hosp"    = "COVID Tracking Project hospitalized",
  "CTP_tests"   = "COVID Tracking Project tests",
  "USAF_cases"  = "USA Facts cases",
  "USAF_deaths" = "USA Facts deaths",
  # Generic per-capita labels
  "vpm7"        = "Per million people (7-day rolling daily average)",
  "vpm"         = "Per million people"
)
New data from COVID Tracking Project are downloaded and assessed against the existing state-level segments:
# Use existing segments with updated data
locDownload <- "./RInputFiles/Coronavirus/CV_downloaded_201108.csv"
# Read the prior run's RDS once, rather than hitting the file twice
# (the original called readFromRDS("test_hier5_201025") for both arguments)
prevRun_201025 <- readFromRDS("test_hier5_201025")
test_old_201108 <- readRunCOVIDTrackingProject(
  thruLabel="Nov 7, 2020",
  # Download only when the snapshot is not already on disk (NULL = no download)
  downloadTo=if (file.exists(locDownload)) NULL else locDownload,
  readFrom=locDownload,
  compareFile=prevRun_201025$dfRaw,        # reference data for comparisons
  useClusters=prevRun_201025$useClusters   # reuse existing state segments
)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## totalTestResultsSource = col_character(),
## dataQualityGrade = col_character(),
## lastUpdateEt = col_character(),
## dateModified = col_datetime(format = ""),
## checkTimeEt = col_character(),
## dateChecked = col_datetime(format = ""),
## fips = col_character(),
## hash = col_character(),
## grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
## Warning: 7520 parsing failures.
## row col expected actual file
## 3 dateModified valid date 2020-11-07T24:00:00Z './RInputFiles/Coronavirus/CV_downloaded_201108.csv'
## 3 dateChecked valid date 2020-11-07T24:00:00Z './RInputFiles/Coronavirus/CV_downloaded_201108.csv'
## 4 dateModified valid date 2020-11-01T24:00:00Z './RInputFiles/Coronavirus/CV_downloaded_201108.csv'
## 4 dateChecked valid date 2020-11-01T24:00:00Z './RInputFiles/Coronavirus/CV_downloaded_201108.csv'
## 5 dateModified valid date 2020-11-07T24:00:00Z './RInputFiles/Coronavirus/CV_downloaded_201108.csv'
## ... ............ .......... .................... ....................................................
## See problems(...) for more details.
##
## File is unique by state and date
##
##
## Overall control totals in file:
## # A tibble: 1 x 3
## positiveIncrease deathIncrease hospitalizedCurrently
## <dbl> <dbl> <dbl>
## 1 9761373 229238 9362912
##
## *** COMPARISONS TO REFERENCE FILE: compareFile
##
## Checkin for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checkin for similarity of: states
## In reference but not in current:
## In current but not in reference:
##
## Checkin for similarity of: dates
## In reference but not in current:
## In current but not in reference: 2020-11-07 2020-11-06 2020-11-05 2020-11-04 2020-11-03 2020-11-02 2020-11-01 2020-10-31 2020-10-30 2020-10-29 2020-10-28 2020-10-27 2020-10-26 2020-10-25
##
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("date", "name")
## date name newValue oldValue
## 1 2020-03-07 positiveIncrease 171 176
## 2 2020-03-11 positiveIncrease 502 509
## 3 2020-03-13 positiveIncrease 1059 1072
## 4 2020-03-18 positiveIncrease 3023 3089
## 5 2020-03-26 positiveIncrease 17544 17720
## 6 2020-03-28 positiveIncrease 19586 19925
## 7 2020-03-29 positiveIncrease 19570 19348
## 8 2020-03-30 positiveIncrease 21691 22042
## 9 2020-04-01 positiveIncrease 26078 25791
## 10 2020-04-06 positiveIncrease 28592 29002
## 11 2020-04-13 positiveIncrease 24758 25195
## 12 2020-04-15 positiveIncrease 29755 30307
## 13 2020-04-16 positiveIncrease 31489 30978
## 14 2020-04-24 positiveIncrease 33698 34231
## 15 2020-05-12 positiveIncrease 22520 22890
## 16 2020-05-13 positiveIncrease 21577 21285
## 17 2020-05-15 positiveIncrease 25371 24685
## 18 2020-05-16 positiveIncrease 23560 24702
## 19 2020-05-17 positiveIncrease 20344 20009
## 20 2020-05-18 positiveIncrease 20812 21028
## 21 2020-05-23 positiveIncrease 22167 21531
## 22 2020-05-24 positiveIncrease 19148 20072
## 23 2020-05-30 positiveIncrease 23443 23682
## 24 2020-06-04 positiveIncrease 20256 20886
## 25 2020-06-05 positiveIncrease 23004 23394
## 26 2020-06-06 positiveIncrease 22773 23064
## 27 2020-06-10 positiveIncrease 20637 20894
## 28 2020-06-12 positiveIncrease 23185 23597
## 29 2020-06-18 positiveIncrease 27135 27746
## 30 2020-06-19 positiveIncrease 30881 31471
## 31 2020-06-21 positiveIncrease 28991 27928
## 32 2020-06-23 positiveIncrease 33848 33447
## 33 2020-07-02 positiveIncrease 53385 54085
## 34 2020-07-06 positiveIncrease 41416 41959
## 35 2020-08-01 positiveIncrease 60416 61101
## 36 2020-08-08 positiveIncrease 53158 53712
## 37 2020-08-14 positiveIncrease 57254 55636
## 38 2020-08-22 positiveIncrease 45722 46236
## 39 2020-09-02 positiveIncrease 30287 30603
## 40 2020-09-07 positiveIncrease 28237 28682
## 41 2020-09-15 positiveIncrease 34879 35445
## 42 2020-09-19 positiveIncrease 44886 45564
## 43 2020-09-20 positiveIncrease 35688 36295
## 44 2020-09-21 positiveIncrease 39062 39472
## 45 2020-09-24 positiveIncrease 43243 43772
## 46 2020-09-27 positiveIncrease 35061 35454
## 47 2020-09-28 positiveIncrease 36056 36524
## 48 2020-09-29 positiveIncrease 36289 36947
## 49 2020-10-21 positiveIncrease 60657 58606
## 50 2020-10-22 positiveIncrease 72887 75248
## Joining, by = c("date", "name")
## Warning: Removed 14 row(s) containing missing values (geom_path).
##
##
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("state", "name")
## state name newValue oldValue
## 1 FL positiveIncrease 766305 776249
## 2 PR positiveIncrease 31067 61275
## 3 RI positiveIncrease 30581 30116
## Rows: 13,999
## Columns: 55
## $ date <date> 2020-11-07, 2020-11-07, 2020-11-07, 20...
## $ state <chr> "AK", "AL", "AR", "AS", "AZ", "CA", "CO...
## $ positive <dbl> 19306, 202482, 120828, 0, 257384, 95695...
## $ probableCases <dbl> NA, 30709, 10812, NA, 6590, NA, 7501, 4...
## $ negative <dbl> 728589, 1224595, 1308477, 1768, 1608041...
## $ pending <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestResultsSource <chr> "totalTestsViral", "totalTestsViral", "...
## $ totalTestResults <dbl> 747288, 1396368, 1418493, 1768, 1858835...
## $ hospitalizedCurrently <dbl> 105, 1015, 633, NA, 1139, 3456, 1041, 4...
## $ hospitalizedCumulative <dbl> NA, 21294, 7415, NA, 22170, NA, 9911, 1...
## $ inIcuCurrently <dbl> NA, NA, 235, NA, 249, 901, NA, NA, 25, ...
## $ inIcuCumulative <dbl> NA, 2121, NA, NA, NA, NA, NA, NA, NA, N...
## $ onVentilatorCurrently <dbl> 8, NA, 105, NA, 137, NA, NA, NA, 14, NA...
## $ onVentilatorCumulative <dbl> NA, 1225, 865, NA, NA, NA, NA, NA, NA, ...
## $ recovered <dbl> 7157, 84471, 106594, NA, 42950, NA, 847...
## $ dataQualityGrade <chr> "A", "A", "A+", "D", "A+", "B", "A", "B...
## $ lastUpdateEt <chr> "11/7/2020 03:59", "11/7/2020 11:00", "...
## $ dateModified <dttm> 2020-11-07 03:59:00, 2020-11-07 11:00:...
## $ checkTimeEt <chr> "11/06 22:59", "11/07 06:00", "11/06 19...
## $ death <dbl> 84, 3082, 2068, 0, 6147, 17939, 2168, 4...
## $ hospitalized <dbl> NA, 21294, 7415, NA, 22170, NA, 9911, 1...
## $ dateChecked <dttm> 2020-11-07 03:59:00, 2020-11-07 11:00:...
## $ totalTestsViral <dbl> 747288, 1396368, 1418493, 1768, NA, 195...
## $ positiveTestsViral <dbl> 21064, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ negativeTestsViral <dbl> 725850, NA, 1308477, NA, NA, NA, NA, NA...
## $ positiveCasesViral <dbl> 19306, 171773, 110016, 0, 250794, 95695...
## $ deathConfirmed <dbl> 84, 2864, 1890, NA, 5730, NA, NA, 3757,...
## $ deathProbable <dbl> NA, 218, 178, NA, 417, NA, NA, 914, NA,...
## $ totalTestEncountersViral <dbl> NA, NA, NA, NA, NA, NA, 2177091, NA, 55...
## $ totalTestsPeopleViral <dbl> NA, NA, NA, NA, 1858835, NA, 1303345, N...
## $ totalTestsAntibody <dbl> NA, NA, NA, NA, 324293, NA, 187631, NA,...
## $ positiveTestsAntibody <dbl> NA, NA, NA, NA, NA, NA, 14007, NA, NA, ...
## $ negativeTestsAntibody <dbl> NA, NA, NA, NA, NA, NA, 173624, NA, NA,...
## $ totalTestsPeopleAntibody <dbl> NA, 66104, NA, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ negativeTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestsPeopleAntigen <dbl> NA, NA, 76277, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsPeopleAntigen <dbl> NA, NA, 12392, NA, NA, NA, NA, NA, NA, ...
## $ totalTestsAntigen <dbl> NA, NA, 21856, NA, NA, NA, NA, 26512, N...
## $ positiveTestsAntigen <dbl> NA, NA, 3300, NA, NA, NA, NA, NA, NA, N...
## $ fips <chr> "02", "01", "05", "60", "04", "06", "08...
## $ positiveIncrease <dbl> 607, 1768, 1598, 0, 2620, 5863, 3463, 0...
## $ negativeIncrease <dbl> -34013, 7593, 10213, 0, 14060, 162939, ...
## $ total <dbl> 747895, 1427077, 1429305, 1768, 1865425...
## $ totalTestResultsIncrease <dbl> -34013, 8920, 11491, 0, 16524, 168802, ...
## $ posNeg <dbl> 747895, 1427077, 1429305, 1768, 1865425...
## $ deathIncrease <dbl> 0, 33, 12, 0, 38, 73, 10, 0, 2, 0, 87, ...
## $ hospitalizedIncrease <dbl> 0, 0, 14, 0, 152, 0, 197, 0, 0, 0, 161,...
## $ hash <chr> "f2176e93601204643e0618a661e7c3603f44f4...
## $ commercialScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeRegularScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ positiveScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ score <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ grade <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
##
##
## Control totals - note that validState other than TRUE will be discarded
##
## # A tibble: 2 x 6
## validState cases deaths hosp tests n
## <lgl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 FALSE 44159 976 NA 457982 1185
## 2 TRUE 9717214 228262 NA 154892254 12814
## Rows: 12,814
## Columns: 6
## $ date <date> 2020-11-07, 2020-11-07, 2020-11-07, 2020-11-07, 2020-11-07,...
## $ state <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", ...
## $ cases <dbl> 607, 1768, 1598, 2620, 5863, 3463, 0, 99, 223, 4380, 1719, 1...
## $ deaths <dbl> 0, 33, 12, 38, 73, 10, 0, 2, 0, 87, 39, 0, 11, 8, 91, 45, 0,...
## $ hosp <dbl> 105, 1015, 633, 1139, 3456, 1041, 402, 77, 115, 2672, 1859, ...
## $ tests <dbl> -34013, 8920, 11491, 16524, 168802, 35158, 0, 4262, 5159, 48...
## Rows: 12,814
## Columns: 14
## $ date <date> 2020-01-22, 2020-01-22, 2020-01-23, 2020-01-23, 2020-01-24,...
## $ state <chr> "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", ...
## $ cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hosp <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tests <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ cpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ dpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm <dbl> 0.0000000, 0.0000000, 0.1471796, 0.0000000, 0.0000000, 0.000...
## $ cpm7 <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ dpm7 <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ hpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm7 <dbl> NA, NA, NA, NA, NA, NA, 0.04205130, 0.00000000, 0.06307695, ...
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
##
## Recency is defined as 2020-10-09 through current
##
## Recency is defined as 2020-10-09 through current
## `summarise()` regrouping output by 'state', 'cluster', 'date' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Persist the refreshed CTP run; with ovrWriteError=FALSE an already-existing
# RDS is left in place rather than raising an error (see the "Not replacing
# the existing file" behavior this function exhibits later in this document)
saveToRDS(test_old_201108, ovrWriteError=FALSE)
Cases appear to be spiking in some of the states in the segment that had previously been less impacted. Hospitalizations in aggregate in this segment are starting to slope upwards, though not yet at the same rate as the increase in cases.
New data from USA Facts are downloaded and assessed against the existing county-level segments:
# Locations for the population, case, and death file
popLoc <- "./RInputFiles/Coronavirus/covid_county_population_usafacts.csv"
caseLoc <- "./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20201109.csv"
deathLoc <- "./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201109.csv"
# Read the prior county run's RDS once, rather than loading it twice
ctyPrev_20201026 <- readFromRDS("cty_20201026")
# Run old segments against new data
cty_old_20201109 <- readRunUSAFacts(
  maxDate="2020-11-07",
  popLoc=popLoc,
  caseLoc=caseLoc,
  deathLoc=deathLoc,
  # Download only when either input file is missing; && (scalar, short-circuit)
  # is the correct operator here, not elementwise &
  dlCaseDeath=!(file.exists(caseLoc) && file.exists(deathLoc)),
  oldFile=ctyPrev_20201026$dfBurden,                  # prior burden data for comparison
  existingCountyClusters=ctyPrev_20201026$clustVec    # reuse existing county segments
)
##
## -- Column specification --------------------------------------------------------
## cols(
## countyFIPS = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## population = col_double()
## )
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Rows: 929,745
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumCases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
## row col expected actual file
## 1366 11/7/20 no trailing characters 1,020 './RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201109.csv'
## Rows: 929,745
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumDeaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: Removed 2 rows containing missing values (geom_point).
## `summarise()` ungrouping output (override with `.groups` argument)
##
## Shapes will be created without any floor on the number of cases per million
## Shapes will be created without any floor on the number of deaths per million
## *** Counties with 0 cases/deaths or that fall below the floor for minCase/minDeath ***
## # A tibble: 1 x 4
## cpm_mean_is0 dpm_mean_is0 dpm_mean_ltDeath cpm_mean_ltCase
## <dbl> <dbl> <dbl> <dbl>
## 1 0 NA NA 0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
##
## Recency is defined as 2020-10-09 through current
##
## Recency is defined as 2020-10-09 through current
## Warning: Removed 2 rows containing missing values (geom_point).
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## Joining, by = "fipsCounty"
## Joining, by = "fipsCounty"
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Persist the refreshed county run; ovrWriteError=FALSE means an existing RDS
# is kept and the save is skipped with a message (see the "Not replacing the
# existing file" output immediately below) instead of raising an error
saveToRDS(cty_old_20201109, ovrWriteError=FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/cty_old_20201109.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
Cases appear to be growing in many of the county clusters, though deaths per million per day remain well below the peaks observed in April in the earlier-hit counties.
Next, all-cause death data from the CDC are downloaded and assessed:
# Use data that have previously been downloaded
cdcLoc <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20201110.csv"
cdcList_20201110 <- readRunCDCAllCause(
  loc=cdcLoc,
  startYear=2015,                 # first year of the 2015-2019 baseline period
  curYear=2020,
  weekThru=36,                    # analyze 2020 through MMWR week 36
  startWeek=9,
  lst=readFromRDS("test_old_201108"),      # CTP results from the run above
  epiMap=readFromRDS("epiMonth"),
  agePopData=readFromRDS("usPopBucket2020"),
  cvDeathThru="2020-09-05",       # cut-off date; week 36 ends 2020-09-05
  cdcPlotStartWeek=10,
  # Download only if the snapshot is not already on disk;
  # file.path() builds the path idiomatically (was paste0 with a "/" literal)
  dlData=!file.exists(file.path("./RInputFiles/Coronavirus", cdcLoc))
)
## Rows: 178,482
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date` <chr> "01/10/2015", "01/17/2015", "01/24/2015", "01/...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group` <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths` <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period` <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 178,482
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
## age n n_deaths_na deaths
## <chr> <int> <int> <dbl>
## 1 25-44 years 26725 5 3266337
## 2 45-64 years 32640 10 12792834
## 3 65-74 years 32632 12 12691956
## 4 75-84 years 32652 14 15776825
## 5 85 years and older 32640 15 20567595
## 6 Under 25 years 21193 0 1407550
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups: period, year [6]
## period year type n n_deaths_na deaths
## <chr> <int> <chr> <int> <int> <dbl>
## 1 2015-2019 2015 Predicted (weighted) 15285 0 5416393
## 2 2015-2019 2015 Unweighted 15285 0 5416393
## 3 2015-2019 2016 Predicted (weighted) 15365 0 5483764
## 4 2015-2019 2016 Unweighted 15365 0 5483764
## 5 2015-2019 2017 Predicted (weighted) 15317 0 5643342
## 6 2015-2019 2017 Unweighted 15317 0 5643342
## 7 2015-2019 2018 Predicted (weighted) 15305 0 5698005
## 8 2015-2019 2018 Unweighted 15305 0 5698005
## 9 2015-2019 2019 Predicted (weighted) 15319 0 5725544
## 10 2015-2019 2019 Unweighted 15319 0 5725544
## 11 2020 2020 Predicted (weighted) 12677 34 5332986
## 12 2020 2020 Unweighted 12623 22 5236015
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups: period [2]
## period Suppress n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-2019 <NA> 153182 0 5.59e7
## 2 2020 Suppressed (counts highly incomplete, <5~ 56 56 0.
## 3 2020 <NA> 25244 0 1.06e7
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 9 x 5
## # Groups: period [2]
## period Note n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-20~ <NA> 153182 0 5.59e7
## 2 2020 Data in recent weeks are incomplete. Only~ 19873 10 8.61e6
## 3 2020 Data in recent weeks are incomplete. Only~ 456 0 2.12e5
## 4 2020 Data in recent weeks are incomplete. Only~ 384 35 4.69e4
## 5 2020 Data in recent weeks are incomplete. Only~ 2213 11 7.28e5
## 6 2020 Data in recent weeks are incomplete. Only~ 12 0 7.12e3
## 7 2020 Estimates for Pennsylvania are too low fo~ 48 0 2.26e4
## 8 2020 Weights may be too low to account for und~ 328 0 1.25e5
## 9 2020 <NA> 1986 0 8.13e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## state Jurisdiction n n_deaths_na deaths
## 1 US United States 3636 0 33131804
## 2 CA California 3636 0 3152950
## 3 FL Florida 3636 0 2423089
## 4 TX Texas 3636 0 2383305
## 5 PA Pennsylvania 3636 0 1589393
## 6 OH Ohio 3636 0 1432144
## 7 IL Illinois 3636 0 1250417
## 8 NY New York 3636 0 1183446
## 9 MI Michigan 3636 0 1140202
## 10 NC North Carolina 3568 28 1066679
## 11 GA Georgia 3636 0 994220
## 12 NJ New Jersey 3630 0 886576
## 13 TN Tennessee 3636 0 865673
## 14 VA Virginia 3636 0 795842
## 15 IN Indiana 3632 0 769795
## 16 MO Missouri 3633 0 749309
## 17 AZ Arizona 3636 0 702644
## 18 MA Massachusetts 3602 0 701039
## 19 YC New York City 3632 0 685046
## 20 WA Washington 3636 0 662228
## 21 AL Alabama 3635 0 615046
## 22 WI Wisconsin 3618 0 608567
## 23 MD Maryland 3630 0 585192
## 24 SC South Carolina 3634 0 576703
## 25 KY Kentucky 3599 0 560150
## 26 LA Louisiana 3628 0 541388
## 27 MN Minnesota 3592 0 517087
## 28 CO Colorado 3634 0 458526
## 29 OK Oklahoma 3622 9 455633
## 30 OR Oregon 3466 0 424398
## 31 MS Mississippi 3574 0 374274
## 32 AR Arkansas 3532 0 372266
## 33 CT Connecticut 3191 14 365303
## 34 IA Iowa 3271 0 349432
## 35 PR Puerto Rico 3347 0 340685
## 36 KS Kansas 3327 0 304875
## 37 NV Nevada 3372 0 297386
## 38 WV West Virginia 3075 2 257638
## 39 UT Utah 3522 0 219851
## 40 NM New Mexico 3210 0 210537
## 41 NE Nebraska 2920 0 193764
## 42 ME Maine 2712 0 164439
## 43 ID Idaho 2834 0 157266
## 44 NH New Hampshire 2734 0 138417
## 45 HI Hawaii 2625 0 127727
## 46 RI Rhode Island 2534 3 116993
## 47 MT Montana 2618 0 113028
## 48 DE Delaware 2627 0 102300
## 49 SD South Dakota 2510 0 89238
## 50 ND North Dakota 2496 0 77815
## 51 DC District of Columbia 2609 0 65724
## 52 VT Vermont 2392 0 63461
## 53 WY Wyoming 2379 0 48740
## 54 AK Alaska 2412 0 43447
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## Rows: 178,482
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 87,264
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
##
## *** Data suppression checks ***
## # A tibble: 5 x 11
## Jurisdiction weekEnding state year week age deaths period type Suppress
## <chr> <date> <chr> <fct> <int> <fct> <dbl> <fct> <chr> <chr>
## 1 North Carol~ 2020-09-05 NC 2020 36 25-4~ NA 2020 Pred~ Suppres~
## 2 North Carol~ 2020-09-05 NC 2020 36 45-6~ NA 2020 Pred~ Suppres~
## 3 North Carol~ 2020-09-05 NC 2020 36 65-7~ NA 2020 Pred~ Suppres~
## 4 North Carol~ 2020-09-05 NC 2020 36 75-8~ NA 2020 Pred~ Suppres~
## 5 North Carol~ 2020-09-05 NC 2020 36 85 y~ NA 2020 Pred~ Suppres~
## # ... with 1 more variable: Note <chr>
##
## Data suppression checks OK - 5 records in current week/year suppressed
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)
## Rows: 82,069
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## First duplicate is in row number (0 means no duplicates): 0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)
## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)
## `summarise()` regrouping output by 'year', 'age', 'week' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'state', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "state"
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'age', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'age' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Cache the CDC list to disk for reuse; ovrWriteError=FALSE means an existing
# file is kept (with a message) rather than raising an error
saveToRDS(cdcList_20201110, ovrWriteError=FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/cdcList_20201110.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
The death data can then be combined and analyzed, using the function from Coronavirus_Statistics_States_v003:
# Combine death data for one or more states from three sources -- COVID Tracking
# Project (state-level), USA Facts (county-level, summed to state), and CDC
# (weekly totals, spread evenly across days) -- plot the 7-day rolling death
# curves against each other, and optionally return the merged daily frame.
combineDeathData <- function(ctp,
usaf,
cdc,
keyState,
curYear=2020,
minDate=as.Date(paste0(curYear, "-01-01")),
perMillion=FALSE,
glimpseIntermediate=FALSE,
facetFreeY=!perMillion,
returnData=TRUE
) {
# FUNCTION ARGUMENTS:
# ctp: the list with COVID Tracking Project data
# usaf: the data frame with USA Facts data
# cdc: the list with CDC data
# keyState: the state(s) to be explored
# curYear: current year
# minDate: the minimum date to use in the CDC data
# perMillion: boolean, should data be show on a per-million-people basis?
# glimpseIntermediate: boolean, should glimpses of frames be provided as they are built?
# facetFreeY: boolean, should facets be created with free_y scales (only relevant if 2+ keyStates)
# returnData: boolean, should the data frame be returned?
# STEP 0a: Extract relevant elements from lists (use frame as-is if directly passed)
if ("list" %in% class(ctp)) ctp <- ctp[["consolidatedPlotData"]]
if ("list" %in% class(usaf)) usaf <- usaf[["clusterStateData"]]
if ("list" %in% class(cdc)) cdc <- cdc[["stateAgg"]]
# STEP 0b: Create a mapping file of date to epiWeek
# (one row per calendar day; used later to spread weekly CDC data to days)
epiMap <- tibble::tibble(date=seq.Date(from=minDate, to=as.Date(paste0(curYear, "-12-31")), by="1 day"),
week=lubridate::epiweek(date)
)
# STEP 1: Filter to only relevant data
# STEP 1a: COVID Tracking Project (deaths metric only, selected states)
ctp <- ctp %>%
ungroup() %>%
filter(name=="deaths", state %in% keyState)
if(glimpseIntermediate) glimpse(ctp)
# STEP 1b: USA Facts
usaf <- usaf %>%
ungroup() %>%
filter(state %in% keyState)
if(glimpseIntermediate) glimpse(usaf)
# STEP 1c: CDC
cdc <- cdc %>%
ungroup() %>%
filter(year==curYear, state %in% keyState)
if(glimpseIntermediate) glimpse(cdc)
# STEP 2a: Sum the county-level data so that it is state-level data
# (dpm7 is a per-million rate, so it is population-weighted rather than summed)
usafState <- usaf %>%
group_by(state, date) %>%
summarize(deaths=sum(deaths), dpm7=sum(dpm7*pop)/sum(pop), pop=sum(pop), .groups="drop_last") %>%
ungroup()
# STEP 2b: Convert the CDC data to an estimated daily total (split the weekly total evenly)
# NOTE(review): 'delta' is treated as CDC excess deaths -- confirm in the CDC prep code
cdcDaily <- cdc %>%
left_join(epiMap, by=c("week")) %>%
select(state, week, date, cdcDeaths=deaths, cdcExcess=delta) %>%
mutate(cdcDeaths=cdcDeaths/7, cdcExcess=cdcExcess/7)
# STEP 3: Create a state death-level database by date
# ctp_death7/usaf_death7 convert the per-million rolling means back to counts
dailyDeath <- select(ctp, state, date, ctpDeaths=value, ctp_dpm7=vpm7, ctp_pop=pop) %>%
full_join(select(usafState, state, date, usafDeaths=deaths, usaf_dpm7=dpm7, usaf_pop=pop),
by=c("state", "date")
) %>%
full_join(cdcDaily, by=c("state", "date")) %>%
arrange(state, date) %>%
mutate(ctp_death7=ctp_dpm7*ctp_pop/1000000, usaf_death7=usaf_dpm7*usaf_pop/1000000)
if(glimpseIntermediate) glimpse(dailyDeath)
# STEP 4a: Assign a population by state (max across both sources' estimates)
statePop <- dailyDeath %>%
group_by(state) %>%
summarize(pop=max(usaf_pop, ctp_pop, na.rm=TRUE), .groups="drop_last")
# STEP 4b: Plot the deaths data
# The y-value is scaled to per-million only when perMillion=TRUE
p1 <- dailyDeath %>%
select(state, date, ctp_death7, usaf_death7, cdcExcess) %>%
pivot_longer(-c(state, date), names_to="source", values_to="deaths") %>%
filter(!is.na(deaths)) %>%
left_join(statePop, by="state") %>%
ggplot(aes(x=date, y=deaths*if(perMillion) (1000000/pop) else 1)) +
geom_line(aes(group=source, color=varMapper[source])) +
labs(x="",
y=paste0("Deaths", if(perMillion) " per million" else ""),
title=paste0(curYear, " deaths per day in ", paste0(keyState, collapse=", ")),
subtitle=paste0("Rolling 7-day average", if(perMillion) " per million people" else ""),
caption="CDC estimated excess all-cause deaths, weekly total divided by 7 to estimate daily total"
) +
scale_x_date(date_breaks="1 month", date_labels="%b") +
scale_color_discrete("Data source") +
theme(legend.position="bottom") +
geom_hline(yintercept=0, lty=2)
if (length(keyState) > 1) p1 <- p1 + facet_wrap(~state, scales=if(facetFreeY) "free_y" else "fixed")
print(p1)
# STEP 5: Return the daily death file (invisible NULL when returnData=FALSE)
if(returnData) dailyDeath
}
# Example function
# Example 1: absolute daily deaths (7-day rolling) for a dozen states
combineDeathData(ctp=test_old_201108,
usaf=cty_old_20201109$clusterStateData,
cdc=cdcList_20201110,
keyState=c("NY", "NJ", "MA", "FL", "GA", "TX", "AZ", "MS", "LA", "MI", "IL", "WI"),
perMillion=FALSE,
returnData=FALSE
)
# Example 2: the same states on a per-million basis for cross-state comparison
combineDeathData(ctp=test_old_201108,
usaf=cty_old_20201109$clusterStateData,
cdc=cdcList_20201110,
keyState=c("NY", "NJ", "MA", "FL", "GA", "TX", "AZ", "MS", "LA", "MI", "IL", "WI"),
perMillion=TRUE,
returnData=FALSE
)
In general, the shapes of the curves are well aligned, though the CDC excess-deaths generally start earlier and peak higher than the coronavirus deaths curves from COVID Tracking Project and USA Facts. In general, the northeastern states hit early have a classic epidemic peak curve, while the states hit later tend to have lower peaks (per million) but, in some cases, a much extended timeline of non-zero disease impact.
Next, an integrated state data file is created from the latest data:
# Load the previously cached data for the three sources from their RDS files
ctpList <- readFromRDS("test_old_201108")
usafData <- readFromRDS("cty_old_20201109")$clusterStateData
cdcList <- readFromRDS("cdcList_20201110")
# Function to convert a COVID Tracking Project file for further processing
# Prepare COVID Tracking Project data for integration with the other sources.
# Accepts either the processed CTP list (element 'consolidatedPlotData' is
# used) or a data frame directly. Returns a list with:
#   ctpDemo: one row per state/cluster carrying its (max observed) population
#   ctpData: long-format data keyed by state/date/metric with "CTP_"-prefixed names
prepCTPData <- function(ctp) {
  # When a list is supplied, extract the consolidated plotting frame
  if ("list" %in% class(ctp)) {
    ctp <- ctp[["consolidatedPlotData"]]
  }
  # Drop the synthetic 'cluster' state and derive value7, the count-scale
  # version of the per-million rolling mean (NA propagates from vpm7)
  base <- ctp %>%
    ungroup() %>%
    filter(state != "cluster") %>%
    mutate(value7=ifelse(is.na(vpm7), NA, vpm7*pop/1000000))
  # Demographics frame: unique by state-cluster with population
  demoFrame <- base %>%
    group_by(state, cluster) %>%
    summarize(pop=max(pop, na.rm=TRUE), .groups="drop")
  # Data frame: tag each metric with its source, e.g. "CTP_cases"
  dataFrame <- base %>%
    rename(metric=name) %>%
    mutate(source="CTP", name=paste0(source, "_", metric)) %>%
    select(state, date, metric, source, name, value, value7, vpm, vpm7)
  # Return both frames under their original names
  list(ctpDemo=demoFrame, ctpData=dataFrame)
}
# Prepare the CTP list into its demographics/data frames for integration
ctpPrepped <- prepCTPData(ctpList)
# Function to convert a USA Facts file for further processing
# Prepare USA Facts data for integration with the other sources.
# Accepts either the processed USAF list (element 'clusterStateData' is used)
# or a data frame directly. Returns a list with:
#   usafDemo: one row per state with its (max observed) population
#   usafData: long-format data keyed by state/date/metric with "USAF_"-prefixed names
prepUSAFData <- function(usaf) {
  # When a list is supplied, extract the cluster/state frame
  if ("list" %in% class(usaf)) {
    usaf <- usaf[["clusterStateData"]]
  }
  # Sum counties up to state-date (keeping pop/cases/deaths), then lengthen
  # so each metric becomes its own row
  longState <- usaf %>%
    group_by(state, date) %>%
    summarize(cases=sum(cases), deaths=sum(deaths), pop=sum(pop), .groups="drop") %>%
    pivot_longer(-c(state, date, pop), names_to="metric", values_to="value")
  # Add the rolling 7-day aggregate (helperRollingAgg expects grouped,
  # date-sorted input) and the per-million versions of both values
  longState <- longState %>%
    group_by(state, pop, metric) %>%
    arrange(date) %>%
    helperRollingAgg(origVar="value", newName="value7") %>%
    ungroup() %>%
    mutate(vpm=value*1000000/pop, vpm7=value7*1000000/pop)
  # Demographics frame: unique by state with population
  demoFrame <- longState %>%
    group_by(state) %>%
    summarize(pop=max(pop, na.rm=TRUE), .groups="drop")
  # Data frame: tag each metric with its source, e.g. "USAF_deaths"
  dataFrame <- longState %>%
    mutate(source="USAF", name=paste0(source, "_", metric)) %>%
    select(state, date, metric, source, name, value, value7, vpm, vpm7)
  # Return both frames under their original names
  list(usafDemo=demoFrame, usafData=dataFrame)
}
# Prepare the USA Facts data into its demographics/data frames for integration
usafPrepped <- prepUSAFData(usafData)
# Function to convert a CDC file for further processing
# Prepare CDC weekly death data for integration with the other sources.
# Weekly totals are spread evenly across days (divided by 7), converted to
# long format, given a rolling 7-day mean, and expressed per million using
# the supplied population file. Returns a list with:
#   cdcDemo: one row per state with its population
#   cdcData: long-format data keyed by state/date/metric with "CDC_"-prefixed names
prepCDCData <- function(cdc,
popData,
startYear=2020,
startDate=as.Date(paste0(startYear, "-01-01")),
endDate=as.Date(paste0(startYear, "-12-31"))
) {
# FUNCTION ARGUMENTS:
# cdc: a properly formatted list or data frame containing processed CDC data
# popData: a file containing fields state-pop
# startYear: starting year (CDC data will be filtered for this year and later)
# startDate: the starting date for use in the mapping file to create daily estimates
# endDate: the ending date for use in the mapping file to create daily estimates
# Pull the relevant data frame if a list has been passed
if ("list" %in% class(cdc)) cdc <- cdc[["stateAgg"]]
# Create a mapping file of dates to epiweek-epiyear (one row per calendar day)
epiMap <- tibble::tibble(date=seq.Date(from=startDate, to=endDate, by="1 day"),
year=lubridate::epiyear(date),
week=lubridate::epiweek(date)
)
# Filter the data to the relevant year and keep state-year-week-deaths-excess
# NOTE(review): 'delta' is treated as excess deaths -- confirm upstream
cdc <- cdc %>%
filter(yearint >= startYear) %>%
select(state, yearint, week, deaths, excess=delta)
# Merge in the daily mapping file, divide all totals by 7 to reflect weekly to daily, and pivot longer
cdc <- cdc %>%
left_join(epiMap, by=c("yearint"="year", "week"="week")) %>%
mutate(deaths=deaths/7, excess=excess/7) %>%
select(state, date, deaths, excess) %>%
pivot_longer(-c(state, date), names_to="metric", values_to="value")
# Create the rolling-7 for value, having grouped by state-metric and sorted by date
# Add the per million component
cdc <- cdc %>%
group_by(state, metric) %>%
arrange(date) %>%
helperRollingAgg(origVar="value", newName="value7") %>%
ungroup() %>%
left_join(select(popData, state, pop), by="state") %>%
mutate(vpm=value*1000000/pop,
vpm7=value7*1000000/pop,
source="CDC",
name=paste0(source, "_", metric)
) %>%
select(state, date, pop, metric, source, name, value, value7, vpm, vpm7)
# Return the key data frames as a list
# FIX: deduplicate cdcDemo to one row per state so it matches the
# 'unique by state' convention of ctpDemo and usafDemo (previously the
# state-pop pair was repeated once per state-date-metric row)
list(cdcDemo=distinct(select(cdc, state, pop)), cdcData=select(cdc, -pop))
}
# Create an integrated state demographics file
# Population is the pairwise max of the CTP and USAF estimates per state
demoData <- ctpPrepped$ctpDemo %>%
rename(popCTP=pop) %>%
full_join(rename(usafPrepped$usafDemo, popUSAF=pop), by="state") %>%
mutate(pop=pmax(popCTP, popUSAF))
# CDC prep needs the population file to compute its per-million metrics
cdcPrepped <- prepCDCData(cdcList, popData=demoData)
# Integrated state data
# Stack the three long-format sources (shared columns) into a single frame
stateData <- ctpPrepped$ctpData %>%
bind_rows(usafPrepped$usafData) %>%
bind_rows(cdcPrepped$cdcData)
glimpse(stateData)
## Rows: 104,103
## Columns: 9
## $ state <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", ...
## $ date <date> 2020-03-06, 2020-03-07, 2020-03-08, 2020-03-09, 2020-03-10,...
## $ metric <chr> "cases", "cases", "cases", "cases", "cases", "cases", "cases...
## $ source <chr> "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP...
## $ name <chr> "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cas...
## $ value <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 3, 0, 6, 2, 8, 0, 14, 6,...
## $ value7 <dbl> NA, NA, NA, 0.0000000, 0.1428571, 0.1428571, 0.1428571, 0.14...
## $ vpm <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, ...
## $ vpm7 <dbl> NA, NA, NA, 0.0000000, 0.1934601, 0.1934601, 0.1934601, 0.19...
# Control totals
# Sum value and value7 by source_metric as a cross-source sanity check
stateData %>%
group_by(name) %>%
summarize(value=sum(value, na.rm=TRUE), value7=sum(value7, na.rm=TRUE), .groups="drop_last")
## # A tibble: 8 x 3
## name value value7
## <chr> <dbl> <dbl>
## 1 CDC_deaths 2224832. 2173735.
## 2 CDC_excess 242466. 240886.
## 3 CTP_cases 9717214 9372412.
## 4 CTP_deaths 228262 224869.
## 5 CTP_hosp 9301372 9036822.
## 6 CTP_tests 154892254 150735326.
## 7 USAF_cases 9679664 9336904.
## 8 USAF_deaths 233286 230182.
The alignment of cases and deaths data can then be plotted:
# Plot one metric (column yVal) for selected source_metric names and states
# from the integrated state data, optionally combining states into regions,
# optionally adding a secondary y-axis for a second set of names, and
# optionally returning the (possibly combined/aggregated) filtered data.
plotStateMetric <- function(df,
yVal,
namesPlot,
keyStates,
namesSec=NULL,
scaleSec=NULL,
plotTitle=NULL,
plotSub=NULL,
plotCaption=NULL,
primYLab=NULL,
secYLab="Caution, different metric and scale",
facetFixed=TRUE,
mapper=varMapper,
combStates=vector("character", 0),
popData=NULL,
yValPerCap=(yVal %in% c("vpm", "vpm7")),
printPlot=TRUE,
returnData=FALSE
) {
# FUNCTION ARGUMENTS:
# df: data frame with integrated state data
# yVal: column to use for the yValues
# namesPlot: the values of column 'name' to be kept and plotted
# keyStates: states to be included
# if more than one state is passed, facets will be created
# namesSec: names to be plotted on a secondary y-axes
# scaleSec: scale to be used for the secondary axis
# namesSec/scaleSec should be similar in magnitude to namesPlot
# plotTitle: plot title to be used (NULL means none)
# plotSub: plot subtitle to be used (NULL means none)
# plotCaption: plot caption to be used (NULL means none)
# primYLab: primary y label (NULL means use mapper)
# secYLab: secondary y label (default is "Caution, different metric and scale")
# facetFixed: boolean, if TRUE scales="fixed", if FALSE scales="free_y"
# only relevant if length(keyStates) > 1
# mapper: mapping file for variable names to descriptive names
# combStates: states that should be combined together for plotting (named vector, c("state"="newName"))
# popData: a population file for combining states
# yValPerCap: boolean, is the y-value of type per-capita?
# printPlot: boolean, whether to print the plots
# returnData: boolean, whether to return the data
# Routine is only set up for a secondary axis with facetFixed=TRUE
if (!is.null(namesSec) & !facetFixed) stop("\nSecondary axis only programmed for scales='fixed'\n")
# Include variables in namesSec as part of namesPlot so they are kept by filter
if (!is.null(namesSec)) namesPlot <- unique(c(namesPlot, namesSec))
# Filter the data for only the key elements
# (yVal arrives as a string, hence the vars/all_of selection)
df <- df %>%
select_at(vars(all_of(c("state", "date", "name", yVal)))) %>%
filter(state %in% keyStates, name %in% namesPlot)
# If there is a list of states to combine, process them
if (length(combStates) > 0) {
if (is.null(popData)) { stop("\nCombining states requires population data\n") }
# Create a data frame with population and new state names
# (states not listed in combStates keep their original name)
df <- df %>%
left_join(select(popData, state, pop), by="state") %>%
mutate(state=ifelse(state %in% names(combStates), combStates[state], state))
# Aggregate to the new 'state' level data
# Per-capita metrics are population-weighted; count metrics are summed
if (yValPerCap) {
df <- df %>%
group_by(state, date, name) %>%
filter(!is.na(get(yVal))) %>% # only sum population where yVal exists
summarize(!!yVal:=sum(get(yVal)*pop)/sum(pop), pop=sum(pop), .groups="drop_last")
} else {
df <- df %>%
group_by(state, date, name) %>%
filter(!is.na(get(yVal))) %>% # only sum population where yVal exists
summarize(!!yVal:=sum(get(yVal)), pop=sum(pop), .groups="drop_last")
}
# Ungroup data frame
df <- df %>%
ungroup()
}
# If there is a secondary scale but no scaleSec has been passed, create one
# from the ratio of the two series' maxima
if (!is.null(namesSec) & is.null(scaleSec)) {
maxPrimary <- df %>%
filter(name %in% setdiff(namesPlot, namesSec)) %>%
summarize(max(get(yVal), na.rm=TRUE), .groups="drop_last") %>%
max()
maxSecondary <- df %>%
filter(name %in% namesSec) %>%
summarize(max(get(yVal), na.rm=TRUE), .groups="drop_last") %>%
max()
scaleSec <- maxSecondary/maxPrimary
cat("\nWill scale by:", scaleSec, "\n")
}
# Create the primary y-axis label from mapper if it has not been passed
if (is.null(primYLab)) primYLab <- mapper[yVal]
# Create the relevant line plot
# NOTE(review): aes_string() is deprecated in newer ggplot2 -- works in 3.3.x
if (printPlot) {
p1 <- df %>%
filter(!is.na(get(yVal))) %>%
ggplot(aes_string(x="date")) +
geom_line(data=~filter(., name %in% setdiff(namesPlot, namesSec)),
aes(y=get(yVal), group=name, color=mapper[name])
) +
scale_x_date(date_breaks="1 month", date_labels="%b") +
geom_hline(aes(yintercept=0), lty=2) +
labs(x="") +
theme(axis.text.x = element_text(angle = 90))
# Secondary series are drawn divided by scaleSec; sec_axis inverts the scale
if (!is.null(namesSec)) {
p1 <- p1 +
geom_line(data=~filter(., name %in% namesSec),
aes(y=get(yVal)/scaleSec, color=mapper[name], group=name)
) +
scale_y_continuous(name=primYLab,
sec.axis=sec_axis(~.*scaleSec, name=secYLab)
)
} else {
p1 <- p1 + scale_y_continuous(name=primYLab)
}
if (length(keyStates) > 1) p1 <- p1 + facet_wrap(~state, scales=if(facetFixed) "fixed" else "free_y")
if (!is.null(plotTitle)) p1 <- p1 + labs(title=plotTitle)
if (!is.null(plotSub)) p1 <- p1 + labs(subtitle=plotSub)
if (!is.null(plotCaption)) p1 <- p1 + labs(caption=plotCaption)
p1 <- p1 + scale_color_discrete("Source and metric")
print(p1)
}
if (returnData) return(df)
}
# Example of combining states
# Southern/northern New England states are merged into "S NE"/"N NE" regions;
# cases go on the primary axis and deaths on an auto-scaled secondary axis
ne_casedeath <- plotStateMetric(stateData,
yVal="vpm7",
namesPlot=c("CTP_cases"),
namesSec=c("CTP_deaths"),
keyStates=c("NY", "NJ", "MA", "CT", "RI", "NH", "VT", "ME"),
combStates=c("MA"="S NE", "CT"="S NE", "RI"="S NE",
"NH"="N NE", "VT"="N NE", "ME"="N NE"
),
plotTitle="2020 coronavirus burden per million per day (select states)",
plotSub="Cases on main y-axis, deaths on secondary y-axis",
primYLab="Cases per million (7-day rolling mean)",
secYLab="Deaths per million (7-day rolling mean)",
facetFixed=TRUE,
popData=usafPrepped$usafDemo,
returnData=TRUE
)
##
## Will scale by: 0.07901078
# Inspect the combined-region data returned above
ne_casedeath
## # A tibble: 2,076 x 5
## state date name vpm7 pop
## <chr> <date> <chr> <dbl> <dbl>
## 1 N NE 2020-03-06 CTP_cases 0.228 626042
## 2 N NE 2020-03-06 CTP_deaths 0 626042
## 3 N NE 2020-03-07 CTP_cases 0.219 1956650
## 4 N NE 2020-03-07 CTP_deaths 0 1956650
## 5 N NE 2020-03-08 CTP_cases 0.292 1956650
## 6 N NE 2020-03-08 CTP_deaths 0 1956650
## 7 N NE 2020-03-09 CTP_cases 0.438 1956650
## 8 N NE 2020-03-09 CTP_deaths 0 1956650
## 9 N NE 2020-03-10 CTP_cases 0.522 3285978
## 10 N NE 2020-03-10 CTP_deaths 0 3285978
## # ... with 2,066 more rows
An attempt is made to align the curves for two different metrics in a single locale:
# Align two metric curves (e.g. cases and deaths) across states by finding
# the lag of the predictor series that best explains the dependent series in
# a no-intercept linear model with state-specific multipliers. Plots the
# R-squared-vs-lag profile and the actual-vs-predicted curves, and returns
# the best lag, per-state coefficients, merged data, and all fitted models.
alignCurves <- function(df,
valueMetric,
depName,
indepName=setdiff(unique(df$name), depName),
lagsTry=0:30,
yLabel="Deaths per million",
depLabel="cases",
textMetric=stringr::str_split(yLabel, pattern=" ")[[1]][1] %>% stringr::str_to_lower()
) {
# FUNCTION ARGUMENTS
# df: a data frame containing state-date-name-valueMetric, with only 2 value types in 'name'
# valueMetric: the name of the value metric
# depName: the name of the dependent variable (the other will be the predictor)
# indepName: the name of the predictor variable
# lagsTry: the lagged values to attempt
# yLabel: label for the y-axis
# depLabel: label for the title (regression x-variable name)
# textMetric: label for the title (regression y-variable name)
# Check that there are only two values in column 'name'
if (length(unique(df$name))!=2) { stop("\nFunction depends on 'name' having only two possible values\n") }
# Arrange the data by state and date (required for meaningful lag())
df <- df %>%
arrange(state, date)
# Helper: pair the dependent series with the predictor lagged by lagValue
# days (lag is taken within each state)
helperMakeLagData <- function(df, depName, indepName, valueMetric, lagValue) {
depData <- df %>%
filter(name==depName) %>%
select_at(vars(all_of(c("state", "date", valueMetric)))) %>%
purrr::set_names(c("state", "date", "depVar"))
indepData <- df %>%
filter(name==indepName) %>%
group_by(state) %>%
mutate(indepVar=lag(get(valueMetric), lagValue)) %>%
ungroup() %>%
select(state, date, indepVar)
fullData <- depData %>%
full_join(indepData, by=c("state", "date"))
fullData
}
# Run a simple linear model for depName ~ lag(otherName, lagsTry) to assess performance
lmResults <- vector("list", length(lagsTry))
n <- 1
for (lagValue in lagsTry) {
# Run the linear model with no intercept, save, and increment
lmResults[[n]] <- lm(depVar ~ indepVar:state + 0,
data=helperMakeLagData(df,
depName=depName,
indepName=indepName,
valueMetric=valueMetric,
lagValue=lagValue
)
)
n <- n + 1
}
# Collect R-squared by lag and plot the fit profile
dfResults <- tibble::tibble(lags=lagsTry,
rsq=sapply(lmResults, FUN=function(x) summary(x)$r.squared)
)
p1 <- dfResults %>%
ggplot(aes(x=lags, y=rsq)) +
geom_point() +
labs(x="Lag", y="R-squared", title="R-squared vs. lag for aligning curves")
print(p1)
# Calculate the best lag and coefficients
# FIX: slice(1) guards against ties in R-squared; previously a tie made
# bestLag a vector, which would break lag() and the which() lookup below
# (mirrors the row_number()==1 tie guard used in assessStateCFR)
bestLag <- dfResults %>%
filter(rsq==max(rsq)) %>%
slice(1) %>%
pull(lags)
bestCoef <- coef(lmResults[[which(lagsTry==bestLag)]]) %>%
as.data.frame() %>%
purrr::set_names("mult") %>%
tibble::rownames_to_column("state") %>%
mutate(state=str_replace(state, "indepVar:state", ""))
# Plot the curves using the coefficients and lags
bestDF <- helperMakeLagData(df,
depName=depName,
indepName=indepName,
valueMetric=valueMetric,
lagValue=bestLag
) %>%
filter(!is.na(indepVar)) %>%
left_join(bestCoef, by="state") %>%
mutate(pred=mult*indepVar)
p2 <- bestDF %>%
select(state, date, depVar, pred, mult) %>%
pivot_longer(-c(state, date, mult)) %>%
mutate(name=case_when(name=="depVar" ~ "Actual value",
name=="pred" ~ "Predicted value\n(lag, mult)",
TRUE ~ "Unknown Element"
)
) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(group=name, color=name)) +
geom_text(data=~filter(., date==max(date)),
aes(x=date, y=+Inf, label=paste0("Multiplier: ", round(mult, 3))),
hjust=1,
vjust=1
) +
labs(x="",
y=yLabel,
title=paste0("Predicting ",
textMetric,
" based on lagged ",
depLabel,
" (best lag: ",
bestLag,
" days)"
),
subtitle="Best lag is based on highest correlation/R-squared, common across all facets"
) +
facet_wrap(~state) +
scale_x_date(date_breaks="1 month", date_labels="%b") +
theme(axis.text.x = element_text(angle = 90)) +
scale_color_discrete("Metric")
print(p2)
# Return the key data
list(bestLag=bestLag, bestCoef=bestCoef, bestDF=bestDF, lmResults=lmResults)
}
# Convenience wrapper: build the combined plotting frame with
# plotStateMetric() and immediately feed it to alignCurves() to estimate the
# best lag between two metrics (e.g. cases leading deaths).
createAndAlignCurves <- function(df,
yVal,
namesPlot,
keyStates,
lagValueMetric,
lagDepName,
namesSec=NULL,
scaleSec=NULL,
plotTitle=NULL,
plotSub=NULL,
plotCaption=NULL,
primYLab=NULL,
secYLab="Caution, different metric and scale",
facetFixed=TRUE,
mapper=varMapper,
combStates=vector("character", 0),
popData=NULL,
yValPerCap = (yVal %in% c("vpm", "vpm7")),
printPlot=TRUE,
...
) {
  # FUNCTION ARGUMENTS:
  # df: data frame with integrated state data
  # yVal: column to use for the yValues
  # namesPlot: values of column 'name' to keep and plot
  # keyStates: states to include (facets are created when more than one)
  # lagValueMetric: metric used when checking lags (typically 'vpm7')
  # lagDepName: dependent variable (value of 'name') for the lagging process
  # namesSec: names to be plotted on a secondary y-axis
  # scaleSec: scale for the secondary axis (should match namesPlot magnitude)
  # plotTitle/plotSub/plotCaption: plot annotations (NULL means none)
  # primYLab: primary y label (NULL means look it up in mapper)
  # secYLab: secondary y label
  # facetFixed: TRUE for scales="fixed", FALSE for "free_y" (multi-state only)
  # mapper: mapping of variable names to descriptive names
  # combStates: named vector of states to merge (c("state"="newName"))
  # popData: population file, required when combining states
  # yValPerCap: boolean, is yVal a per-capita metric?
  # printPlot: boolean, whether to print the plots
  # ...: additional arguments forwarded to alignCurves()
  # Build (and optionally display) the plotting frame; returnData=TRUE is
  # required here because alignCurves() consumes the frame next
  plotData <- plotStateMetric(df,
                              yVal=yVal,
                              namesPlot=namesPlot,
                              keyStates=keyStates,
                              namesSec=namesSec,
                              scaleSec=scaleSec,
                              plotTitle=plotTitle,
                              plotSub=plotSub,
                              plotCaption=plotCaption,
                              primYLab=primYLab,
                              secYLab=secYLab,
                              facetFixed=facetFixed,
                              mapper=mapper,
                              combStates=combStates,
                              popData=popData,
                              yValPerCap=yValPerCap,
                              printPlot=printPlot,
                              returnData=TRUE
                              )
  # Estimate the best lag and per-state multipliers for the two series
  lagResults <- alignCurves(plotData, valueMetric=lagValueMetric, depName=lagDepName, ...)
  # Return both the data and the lag-model results
  list(dfList=plotData, lmList=lagResults)
}
# Example for northeastern states
# Smaller states are pooled into regions (S NE, N NE, NY/NJ, DE/DC) before
# fitting deaths as a lagged function of cases (lags 0-30 days tried)
neCurveList <- createAndAlignCurves(stateData,
yVal="vpm7",
namesPlot=c("CTP_cases"),
namesSec=c("CTP_deaths"),
keyStates=c("NY", "NJ", "MA", "CT", "RI", "NH", "VT", "ME", "DE", "DC"),
combStates=c("MA"="S NE", "CT"="S NE", "RI"="S NE",
"NH"="N NE", "VT"="N NE", "ME"="N NE",
"NY"="NY/NJ", "NJ"="NY/NJ",
"DE"="DE/DC", "DC"="DE/DC"
),
plotTitle="2020 coronavirus burden per million per day (select states)",
plotSub="Cases on main y-axis, deaths on secondary y-axis",
primYLab="Cases per million (7-day rolling mean)",
secYLab="Deaths per million (7-day rolling mean)",
facetFixed=TRUE,
popData=usafPrepped$usafDemo,
printPlot=TRUE,
lagValueMetric="vpm7",
lagDepName="CTP_deaths",
lagsTry=0:30
)
##
## Will scale by: 0.07975869
# Example for midwestern states
# Same analysis for four midwestern states, plotted individually (no pooling)
mwCurveList <- createAndAlignCurves(stateData,
yVal="vpm7",
namesPlot=c("CTP_cases"),
namesSec=c("CTP_deaths"),
keyStates=c("OH", "MI", "IN", "IL"),
plotTitle="2020 coronavirus burden per million per day (select states)",
plotSub="Cases on main y-axis, deaths on secondary y-axis",
primYLab="Cases per million (7-day rolling mean)",
secYLab="Deaths per million (7-day rolling mean)",
facetFixed=TRUE,
popData=usafPrepped$usafDemo,
printPlot=TRUE,
lagValueMetric="vpm7",
lagDepName="CTP_deaths",
lagsTry=0:30
)
##
## Will scale by: 0.02321167
## Warning: Removed 3 row(s) containing missing values (geom_path).
The midwest is challenging to align. If using a single value for lag and a single value for CFR (case fatality rate), then predictions will have far too few deaths in the early months and far too many deaths in the later months.
The changes in CFR over time can also be estimated:
# Updated for automatic lag time assessment
assessStateCFR <- function(lst,
keyStates,
depVar,
indepVar,
depTitleName,
indepTitleName,
keyMetric="vpm7",
lagEarlyDate=as.Date("2020-03-31"),
lagMidDate=NULL,
lagLateDate=as.Date("2020-10-15"),
lagEarlyValue=10,
lagLateValue=20,
lagsTry=0:30
) {
# FUNCTION ARGUMENTS:
# lst: A list such as produced by createAndAlignCurves()
# keyStates: The key states to be extracted from the list
# depVar: the dependent variable
# indepVar: the independent variable
# depTitleName: the name for the dependent variable in the title
# indepTitleName: the name for the independent variable in the plot title
# keyMetric: the name of the key metric that is being assessed
# lagEarlyDate: the date for the earliest lagging calculation (dates before this will be at lagEarlyValue)
# lagMidDate: if lags are found from data, what midpoint should be used to split data as early vs late?
# NULL means midway between lagEarlyDate and lagLateDate
# lagLateDate: the date for the latest lagging calculation (dates after this will be at lagLateValue)
# lagEarlyValue: the value for lag on lagEarlyDate, will be linearly interpolated to lagLateValue/Date
# NULL means calculate from data and may differ by state
# lagLateValue: the value for lag on lagLateDate, will be linearly interpolated from lagEarlyValue/Date
# NULL means estimate from data and may differ by state
# lagsTry: the values for lag to be attempted if lageEarlyValue and/or lagLateValue is NULL
# Extract the data for keyStates
df <- lst[["dfList"]] %>%
filter(state %in% keyStates, !is.na(get(keyMetric))) %>%
pivot_wider(names_from="name", values_from=keyMetric)
# Function for finding lag time correlations
helperLagCor <- function(lt, lf, dp, id) {
lf %>%
group_by(state) %>%
mutate(y=get(dp), x=lag(get(id), lt)) %>%
summarize(p=cor(x, y, use="complete.obs"), .groups="drop_last") %>%
ungroup() %>%
mutate(lag=lt)
}
# Middle date for splitting data
if (is.null(lagMidDate)) lagMidDate <- mean(c(lagEarlyDate, lagLateDate))
# Get the early lags from the data
eLag <- map_dfr(.x=lagsTry, .f=helperLagCor, lf=filter(df, date<=lagMidDate), dp=depVar, id=indepVar) %>%
group_by(state) %>%
filter(p==max(p)) %>%
filter(row_number()==1) %>%
ungroup() %>%
select(state, earlyLag=lag)
# Get the late lags from the data
lLag <- map_dfr(.x=lagsTry, .f=helperLagCor, lf=filter(df, date>lagMidDate), dp=depVar, id=indepVar) %>%
group_by(state) %>%
filter(p==max(p)) %>%
filter(row_number()==1) %>%
ungroup() %>%
select(state, lateLag=lag)
# Create the full lag frame, including substituting the fixed value(s) if provided
lagFrame <- eLag %>%
inner_join(lLag, by="state")
if (!is.null(lagEarlyValue)) lagFrame <- lagFrame %>% mutate(earlyLag=lagEarlyValue)
if (!is.null(lagLateValue)) lagFrame <- lagFrame %>% mutate(lateLag=lagLateValue)
print(lagFrame)
# Apply the assumed lagging information
fullTime <- as.integer(lagLateDate-lagEarlyDate)
df <- df %>%
left_join(lagFrame, by="state") %>%
arrange(state, date) %>%
group_by(state) %>%
mutate(eLag=lag(get(indepVar), mean(earlyLag)),
lLag=lag(get(indepVar), mean(lateLag)),
pctEarly=pmin(pmax(as.integer(lagLateDate-date)/fullTime, 0), 1),
x=ifelse(is.na(eLag), NA, pctEarly*eLag + (1-pctEarly)*ifelse(is.na(lLag), 0, lLag)),
y=get(depVar),
mon=factor(month.abb[lubridate::month(date)], levels=month.abb)
) %>%
filter(!is.na(x)) %>%
ungroup()
# Regression for data from keyStates
if (length(keyStates) > 1) stateLM <- lm(y ~ x:mon:state + 0, data=df, na.action=na.exclude)
else stateLM <- lm(y ~ x:mon + 0, data=df, na.action=na.exclude)
# Add the predicted value to df
df <- df %>%
mutate(pred=predict(stateLM))
# Plot of curve overlaps
p1 <- df %>%
select(state, date, y, pred) %>%
pivot_longer(-c(state, date)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(color=c("pred"="Predicted", "y"="Actual")[name], group=name)) +
scale_x_date(date_breaks="1 month", date_labels="%b") +
labs(x="",
y=stringr::str_to_title(depTitleName),
title=paste0("Predicted vs. actual ", depTitleName)
) +
scale_color_discrete("Metric") +
facet_wrap(~state)
print(p1)
# Plot of rate by month
p2 <- coef(stateLM) %>%
as.data.frame() %>%
purrr::set_names("CFR") %>%
tibble::rownames_to_column("monState") %>%
mutate(mon=factor(stringr::str_replace_all(monState, pattern="x:mon|:state.+", replacement=""),
levels=month.abb
),
state=if (length(keyStates)==1) keyStates
else stringr::str_replace_all(monState, pattern="x:mon[A-Za-z]{3}:state", replacement="")
) %>%
left_join(lagFrame, by="state") %>%
ggplot(aes(x=mon, y=CFR)) +
geom_col(fill="lightblue") +
geom_text(aes(y=CFR/2, label=paste0(round(100*CFR, 1), "%"))) +
geom_text(data=~filter(., mon==month.abb[lubridate::month(lagMidDate)]),
aes(x=-Inf, y=Inf, label=paste0("Early Lag: ", earlyLag)),
hjust=0,
vjust=1
) +
geom_text(data=~filter(., mon==month.abb[lubridate::month(lagMidDate)]),
aes(x=Inf, y=Inf, label=paste0("Late Lag: ", lateLag)),
hjust=1,
vjust=1
) +
labs(x="",
y=paste0(stringr::str_to_title(depTitleName), " as percentage of lagged ", indepTitleName),
title=paste0(stringr::str_to_title(depTitleName),
" vs. lagged ",
indepTitleName,
" in state(s): ",
paste0(keyStates, collapse=", ")
),
subtitle=paste0("Assumed early lag on ",
lagEarlyDate,
" interpolated to late lag on ",
lagLateDate
),
caption="Linear model coefficients on lagged data with no intercept used to estimate percentage"
) +
facet_wrap(~state)
print(p2)
# Return the data frame
df
}
# Deaths vs. cases in Michigan
mwOut <- assessStateCFR(mwCurveList,
keyStates=c("MI", "IL", "IN"),
depVar="CTP_deaths",
indepVar="CTP_cases",
depTitleName="deaths",
indepTitleName="cases",
lagEarlyValue=NULL,
lagLateValue=NULL
)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(keyMetric)` instead of `keyMetric` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## # A tibble: 3 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 IN 2 2
## 2 IL 5 3
## 3 MI 10 0
# Deaths vs. cases in NY/NJ and S NE
neOut <- assessStateCFR(neCurveList,
keyStates=c("NY/NJ", "S NE"),
depVar="CTP_deaths",
indepVar="CTP_cases",
depTitleName="deaths",
indepTitleName="cases",
lagEarlyValue=NULL,
lagLateValue=NULL
)
## # A tibble: 2 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 NY/NJ 6 24
## 2 S NE 6 0
## Warning: Removed 4 rows containing missing values (position_stack).
## Warning: Removed 4 rows containing missing values (geom_text).
The CFR declines in more recent months, possibly as a function of a greater number of tests finding less serious disease. Lag times are variable but typically seem to be around a week. The use of both lag times and variable CFR by month introduces some risk of over-fitting.
The process for investigating lags and leads can also be refreshed with new data:
# Compute correlations between v1 and lagged/led v2 over all sliding windows.
# Returns one row per (window start, lag) combination with NA counts and the
# correlation p (NA when too few non-missing shifted values overlap the window).
lagVectorWindows <- function(v1,
                             v2,
                             lagsTry=0:30,
                             windowSize=30,
                             minNoNA=ceiling(windowSize/2),
                             updateStatus=FALSE,
                             isLag=TRUE
                             ) {
  # FUNCTION ARGUMENTS:
  # v1: the first vector, which will be used 'as is'
  # v2: the second vector, which will be lagged/led by various values for lagsTry
  # lagsTry: the values for x that will be used in cor(v1, lag/lead(v2, x))
  # windowSize: the size of the window to use in taking snippets of v1 and lagged/led v2
  # minNoNA: minimum number of non-NA lagged/led values needed to calculate a correlation
  # updateStatus: whether to print which window is being worked on
  # isLag: boolean, should a lag or a lead be applied (TRUE is lag, FALSE is lead)
  # Guard: with length(v1) < windowSize, 1:(length(v1)-windowSize+1) would
  # silently yield a malformed descending sequence (e.g. 1:0) and bad indexing
  stopifnot(length(v1) >= windowSize, length(v1) == length(v2))
  # Find the shifting function to be used
  func <- if (isLag) lag else lead
  # Precompute each shifted copy of v2 once; previously func(v2, x) was
  # recomputed for every lag inside every window (twice per lag: na2 and p)
  shifted <- lapply(lagsTry, function(x) func(v2, x))
  # Find the list of all possible window start points
  windowStarts <- seq_len(length(v1) - windowSize + 1)
  # Helper function to create a frame of correlations for one window start
  helperCorr <- function(s) {
    # Create the end point for the window
    e <- s + windowSize - 1
    # Announce the start
    if (updateStatus) cat("\nProcessing window starting at:", s)
    # Create the correlations tibble for all values of lag, and return
    tibble::tibble(startWindow=s, endWindow=e, lags=lagsTry) %>%
      mutate(na1=sum(is.na(v1[s:e])),
             # vapply (rather than sapply) guarantees type-stable output
             na2=vapply(seq_along(lagsTry),
                        FUN=function(i) sum(is.na(shifted[[i]][s:e])),
                        FUN.VALUE=integer(1)
                        ),
             p=vapply(seq_along(lagsTry),
                      FUN=function(i) {
                        w2 <- shifted[[i]][s:e]
                        # Correlation is unreliable with too few overlapping points
                        if (sum(!is.na(w2)) < minNoNA) NA_real_
                        else cor(v1[s:e], w2, use="complete.obs")
                      },
                      FUN.VALUE=numeric(1)
                      )
             )
  }
  # Bind the correlations frames and return
  map_dfr(windowStarts, .f=helperCorr)
}
# Function to assess correlations by lag/lead and window by state.
# Produces three plots (boxplot of correlation by lag, best lag by window start,
# heatmap of correlation by lag and window start) and returns the lag frame.
stateCorr <- function(lst,
                      keyState,
                      met="vpm7",
                      v1Name="CTP_deaths",
                      v2Name="CTP_cases",
                      windowSize=42,
                      isLag=TRUE
                      ) {
  # FUNCTION ARGUMENTS:
  # lst: the processed list
  # keyState: the state of interest
  # met: the metric of interest
  # v1Name: the name of the first vector (this is considered fixed)
  # v2Name: the name of the second vector (this will have the lead/lag applied to it)
  # windowSize: number of days in the window
  # isLag: boolean, whether to use lag (TRUE) or lead (FALSE) on v2Name
  # Extract the data for the key State
  df <- lst[["dfList"]] %>%
    filter(state %in% keyState, !is.na(get(met))) %>%
    arrange(state, date, name)
  # Get the minimum date that is common to both series
  minDate <- df %>%
    group_by(name) %>%
    summarize(date=min(date), .groups="drop_last") %>%
    pull(date) %>%
    max()
  # Extract v1 and v2
  v1 <- df %>%
    filter(name==v1Name, date>=minDate) %>%
    pull(met)
  v2 <- df %>%
    filter(name==v2Name, date>=minDate) %>%
    pull(met)
  # Confirm that dates are the same for both vectors
  dfDates1 <- df %>% filter(name==v1Name, date>=minDate) %>% pull(date)
  dfDates2 <- df %>% filter(name==v2Name, date>=minDate) %>% pull(date)
  # BUG FIX: all.equal() returns TRUE on success but a *character vector* of
  # differences on failure; negating that character vector errors out instead
  # of triggering the intended stop(), so wrap in isTRUE()
  if (!isTRUE(all.equal(dfDates1, dfDates2))) stop("\nDate mismatch\n")
  # Find the lags in the data
  dfLags <- lagVectorWindows(v1, v2, lagsTry=0:30, windowSize=windowSize, isLag=isLag) %>%
    mutate(windowStartDate=dfDates1[startWindow])
  # Give the description of the lag or lead (scalar condition, so plain if/else)
  descr <- if (isLag) "lag (days)" else "lead (days)"
  # Boxplot of correlations by lag
  p1 <- dfLags %>%
    filter(!is.na(p)) %>%
    ggplot(aes(x=factor(lags), y=p)) +
    geom_boxplot(fill="lightblue") +
    labs(x=stringr::str_to_title(descr),
         y="Correlation",
         title=paste0("Box plot of correlation by ", descr)
         )
  print(p1)
  # Plot of best lags by starting date
  p2 <- dfLags %>%
    filter(!is.na(p)) %>%
    group_by(startWindow) %>%
    filter(p==max(p)) %>%
    ggplot(aes(x=windowStartDate, y=lags)) +
    geom_point(aes(size=p)) +
    labs(x="Window start date",
         y=paste0("Best ", descr),
         title=paste0("Best ", descr, " by window starting date")
         ) +
    scale_size_continuous(paste0("p at best ", stringr::str_replace(descr, " .*", ""))) +
    scale_x_date(date_breaks="1 month", date_labels="%b")
  print(p2)
  # Plot of correlations by lag
  p3 <- dfLags %>%
    filter(!is.na(p)) %>%
    ggplot(aes(x=windowStartDate, y=lags)) +
    geom_tile(aes(fill=p)) +
    labs(x="Window start date",
         y=stringr::str_to_title(descr),
         title=paste0(stringr::str_to_title(descr), " by window starting date")
         ) +
    # BUG FIX: the tile plot maps the *fill* aesthetic, so the legend title must
    # be set via scale_fill_continuous (scale_color_continuous had no effect)
    scale_fill_continuous(paste0("p at ", stringr::str_replace(descr, " .*", ""))) +
    scale_x_date(date_breaks="1 month", date_labels="%b")
  print(p3)
  # Rename variable lags to leads if isLag is FALSE
  if (isFALSE(isLag)) dfLags <- dfLags %>%
    rename(leads=lags)
  # Return dfLags
  dfLags
}
miLeadData <- stateCorr(mwCurveList, keyState="MI", v1Name="CTP_cases", v2Name="CTP_deaths", isLag=FALSE)
nynjLeadData <- stateCorr(neCurveList, keyState="NY/NJ", v1Name="CTP_cases", v2Name="CTP_deaths", isLag=FALSE)
And the full process can be run for the state of Texas:
# Example for southern states
soCurveList <- createAndAlignCurves(stateData,
yVal="vpm7",
namesPlot=c("CTP_cases"),
namesSec=c("CTP_deaths"),
keyStates=c("AZ", "FL", "GA", "TX"),
plotTitle="2020 coronavirus burden per million per day (select states)",
plotSub="Cases on main y-axis, deaths on secondary y-axis",
primYLab="Cases per million (7-day rolling mean)",
secYLab="Deaths per million (7-day rolling mean)",
facetFixed=TRUE,
popData=usafPrepped$usafDemo,
printPlot=TRUE,
lagValueMetric="vpm7",
lagDepName="CTP_deaths",
lagsTry=0:30
)
##
## Will scale by: 0.02108297
## Warning: Removed 3 row(s) containing missing values (geom_path).
assessStateCFR(soCurveList,
keyStates=c("AZ", "FL", "GA", "TX"),
depVar="CTP_deaths",
indepVar="CTP_cases",
depTitleName="deaths",
indepTitleName="cases"
)
## # A tibble: 4 x 3
## state earlyLag lateLag
## <chr> <dbl> <dbl>
## 1 GA 10 20
## 2 TX 10 20
## 3 AZ 10 20
## 4 FL 10 20
## Warning: Removed 4 rows containing missing values (position_stack).
## Warning: Removed 4 rows containing missing values (geom_text).
## # A tibble: 967 x 13
## state date CTP_cases CTP_deaths earlyLag lateLag eLag lLag pctEarly
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AZ 2020-03-17 1.17 0 10 20 0.0837 NA 1
## 2 AZ 2020-03-18 1.92 0.0209 10 20 0.146 NA 1
## 3 AZ 2020-03-19 2.93 0.0418 10 20 0.146 NA 1
## 4 AZ 2020-03-20 5.17 0.0418 10 20 0.126 NA 1
## 5 AZ 2020-03-21 7.05 0.105 10 20 0.146 NA 1
## 6 AZ 2020-03-22 8.83 0.126 10 20 0.146 NA 1
## 7 AZ 2020-03-23 11.2 0.167 10 20 0.272 NA 1
## 8 AZ 2020-03-24 14.0 0.272 10 20 0.293 NA 1
## 9 AZ 2020-03-25 16.1 0.293 10 20 0.398 NA 1
## 10 AZ 2020-03-26 16.0 0.314 10 20 0.732 NA 1
## # ... with 957 more rows, and 4 more variables: x <dbl>, y <dbl>, mon <fct>,
## # pred <dbl>
txLeadData <- stateCorr(soCurveList, keyState="TX", v1Name="CTP_cases", v2Name="CTP_deaths", isLag=FALSE)
The process for assessing integrated data is converted to a main function:
# Main driver: build (or accept) the integrated state-level data file and
# optionally run the two-axis plot, curve alignment, and CFR assessment steps.
# Returns a list with stateData, popData, and the outputs of each step run.
integrateStateData <- function(stateData=NULL,
                               popData=NULL,
                               ctpList=NULL,
                               usafData=NULL,
                               cdcList=NULL,
                               glimpseStateData=NULL,
                               runAll=FALSE,
                               runTwoAxis=runAll,
                               yVal="vpm7",
                               var1="CTP_cases",
                               var2="CTP_deaths",
                               keyStates=sort(c(state.abb, "DC")),
                               combStates=vector("character", 0),
                               mapper=varMapper,
                               runCreateAlign=runAll,
                               lagsTry=0:30,
                               runCFR=runAll,
                               lagEarlyValue=10,
                               lagLateValue=20,
                               lagEarlyDate=as.Date("2020-03-31"),
                               lagMidDate=NULL,
                               lagLateDate=as.Date("2020-10-15")
                               ) {
  # FUNCTION ARGUMENTS:
  # stateData: an integrated state-level data file (NULL means create from components)
  # popData: a state-level population data file (must be passed if stateData is not NULL)
  # ctpList: a processed list of COVID Tracking Project data (must be provided if stateData=NULL)
  # usafData: a processed tibble of USA Facts data (must be provided if stateData=NULL)
  # cdcList: a processed list of CDC All-Cause deaths data (must be provided if stateData=NULL)
  # glimpseStateData: boolean, whether to glimpse the stateData file (NULL means only if from components)
  # runAll: boolean, whether to set all of runTwoAxis, runCreateAlign, and runCFR to the same value
  # runTwoAxis: whether to show a plot of two metrics on two axes
  # yVal: the y-value to use from stateData
  # var1: the first of the two variables of interest (should be the leading component if a drives b)
  # var2: the second of the two variables of interest (should be the lagging component if a drives b)
  # keyStates: subset of states to use for analysis
  # combStates: states that should be combined together for plotting (named vector, c("state"="newName"))
  # mapper: mapping file for variables to descriptive names
  # runCreateAlign: boolean, should createAndAlignCurves() be run?
  # lagsTry: lag values to try (applies to both createAlignCurves and assessStateCFR)
  # runCFR: boolean, should assessStateCFR be run?
  # lagEarlyValue: the value for lag on lagEarlyDate, will be linearly interpolated to lagLateValue/Date
  # NULL means calculate from data and may differ by state
  # lagLateValue: the value for lag on lagLateDate, will be linearly interpolated from lagEarlyValue/Date
  # NULL means estimate from data and may differ by state
  # lagEarlyDate: the date for the earliest lagging calculation (dates before this will be at lagEarlyValue)
  # lagMidDate: if lags are found from data, what midpoint should be used to split data as early vs late?
  # NULL means midway between lagEarlyDate and lagLateDate
  # lagLateDate: the date for the latest lagging calculation (dates after this will be at lagLateValue)
  # Check that either stateData or its components have been provided
  if (!is.null(stateData)) {
    cat("\nA file has been passed for stateData, components will be ignored\n")
    if (is.null(popData)) stop("Must also pass a popData file of state-level population\n")
    if (is.null(glimpseStateData)) glimpseStateData <- FALSE
  } else {
    # Scalar condition: use short-circuit || rather than elementwise |
    if (is.null(ctpList) || is.null(usafData) || is.null(cdcList)) {
      stop("\nMust provide all of ctpList, usafData, cdcList when stateData is NULL\n")
    }
    cat("\nBuilding stateData from the passed components\n")
    if (is.null(glimpseStateData)) glimpseStateData <- TRUE
    # Create COVID Tracking Project File
    ctpPrepped <- prepCTPData(ctpList)
    # Create USA Facts file
    usafPrepped <- prepUSAFData(usafData)
    # Create an integrated state population demographics file
    # (pmax takes the larger of the two sources' population by state)
    demoData <- ctpPrepped$ctpDemo %>%
      rename(popCTP=pop) %>%
      full_join(rename(usafPrepped$usafDemo, popUSAF=pop), by="state") %>%
      mutate(pop=pmax(popCTP, popUSAF))
    # Create CDC All-Cause File
    cdcPrepped <- prepCDCData(cdcList, popData=demoData)
    # Integrated state data
    stateData <- ctpPrepped$ctpData %>%
      bind_rows(usafPrepped$usafData) %>%
      bind_rows(cdcPrepped$cdcData)
    # Create popData if not provided
    if (is.null(popData)) popData <- demoData %>% select(state, pop)
  }
  # Show summaries of stateData if requested
  if (glimpseStateData) {
    # Glimpse the file
    glimpse(stateData)
    # Show control totals
    stateData %>%
      group_by(name) %>%
      summarize(value=sum(value, na.rm=TRUE), value7=sum(value7, na.rm=TRUE), .groups="drop_last") %>%
      print()
  }
  # Run plotStateMetric if requested
  if (runTwoAxis) {
    caseDeath <- plotStateMetric(stateData,
                                 yVal=yVal,
                                 namesPlot=var1,
                                 namesSec=var2,
                                 keyStates=keyStates,
                                 combStates=combStates,
                                 plotTitle="2020 coronavirus burden per million per day (select states)",
                                 plotSub="Caution that metrics are on different axes and scales",
                                 primYLab=paste0(mapper[var1], "\n", mapper[yVal]),
                                 secYLab=paste0(mapper[var2], "\n", mapper[yVal]),
                                 facetFixed=TRUE,
                                 popData=popData,
                                 returnData=TRUE
                                 )
  } else {
    caseDeath <- NULL
  }
  # Run createAndAlignCurves() if requested
  if (runCreateAlign) {
    curveList <- createAndAlignCurves(stateData,
                                      yVal=yVal,
                                      namesPlot=var1,
                                      namesSec=var2,
                                      keyStates=keyStates,
                                      combStates=combStates,
                                      plotTitle="2020 coronavirus burden per million per day (select states)",
                                      plotSub="Caution that metrics are on different axes and scales",
                                      primYLab=paste0(mapper[var1], "\n", mapper[yVal]),
                                      secYLab=paste0(mapper[var2], "\n", mapper[yVal]),
                                      facetFixed=TRUE,
                                      popData=popData,
                                      printPlot=TRUE,
                                      lagValueMetric=yVal,
                                      lagDepName=var2,
                                      lagsTry=lagsTry
                                      )
  } else {
    curveList <- NULL
  }
  # Run assessStateCFR() if requested (requires that curveList have been created)
  # Scalar condition: use short-circuit && rather than elementwise &
  if (runCFR && is.null(curveList)) {
    cat("\nassessStateCFR requires runCreateAlign=TRUE; skipping and setting runCFR=FALSE\n")
    runCFR <- FALSE
  }
  if (runCFR) {
    # Create the list of key states based on keyStates and combStates,
    # substituting the combined (plotting) name where one applies
    useStates <- keyStates
    for (ctr in seq_along(useStates)) {
      if (useStates[ctr] %in% names(combStates)) useStates[ctr] <- combStates[useStates[ctr]]
    }
    useStates <- unique(useStates)
    # Run assessStateCFR
    cfrList <- assessStateCFR(curveList,
                              keyStates=useStates,
                              depVar=var2,
                              indepVar=var1,
                              depTitleName=stringr::str_replace(var2, ".*_", ""),
                              indepTitleName=stringr::str_replace(var1, ".*_", ""),
                              keyMetric=yVal,
                              lagEarlyDate=lagEarlyDate,
                              lagMidDate=lagMidDate,
                              lagLateDate=lagLateDate,
                              lagEarlyValue=lagEarlyValue,
                              lagLateValue=lagLateValue,
                              lagsTry=lagsTry
                              )
  } else {
    cfrList <- NULL
  }
  # Return a list of the key data
  list(stateData=stateData, popData=popData, caseDeath=caseDeath, curveList=curveList, cfrList=cfrList)
}
# Example for creating a full stateData file
fullStateList <- integrateStateData(ctpList=readFromRDS("test_old_201108"),
usafData=readFromRDS("cty_old_20201109")$clusterStateData,
cdcList=readFromRDS("cdcList_20201110")
)
##
## Building stateData from the passed components
## Rows: 104,103
## Columns: 9
## $ state <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", ...
## $ date <date> 2020-03-06, 2020-03-07, 2020-03-08, 2020-03-09, 2020-03-10,...
## $ metric <chr> "cases", "cases", "cases", "cases", "cases", "cases", "cases...
## $ source <chr> "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP...
## $ name <chr> "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cas...
## $ value <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 3, 0, 6, 2, 8, 0, 14, 6,...
## $ value7 <dbl> NA, NA, NA, 0.0000000, 0.1428571, 0.1428571, 0.1428571, 0.14...
## $ vpm <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, ...
## $ vpm7 <dbl> NA, NA, NA, 0.0000000, 0.1934601, 0.1934601, 0.1934601, 0.19...
## # A tibble: 8 x 3
## name value value7
## <chr> <dbl> <dbl>
## 1 CDC_deaths 2224832. 2173735.
## 2 CDC_excess 242466. 240886.
## 3 CTP_cases 9717214 9372412.
## 4 CTP_deaths 228262 224869.
## 5 CTP_hosp 9301372 9036822.
## 6 CTP_tests 154892254 150735326.
## 7 USAF_cases 9679664 9336904.
## 8 USAF_deaths 233286 230182.
fullStateList
## $stateData
## # A tibble: 104,103 x 9
## state date metric source name value value7 vpm vpm7
## <chr> <date> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 AK 2020-03-06 cases CTP CTP_cases 0 NA 0 NA
## 2 AK 2020-03-07 cases CTP CTP_cases 0 NA 0 NA
## 3 AK 2020-03-08 cases CTP CTP_cases 0 NA 0 NA
## 4 AK 2020-03-09 cases CTP CTP_cases 0 0 0 0
## 5 AK 2020-03-10 cases CTP CTP_cases 0 0.143 0 0.193
## 6 AK 2020-03-11 cases CTP CTP_cases 0 0.143 0 0.193
## 7 AK 2020-03-12 cases CTP CTP_cases 0 0.143 0 0.193
## 8 AK 2020-03-13 cases CTP CTP_cases 1 0.143 1.35 0.193
## 9 AK 2020-03-14 cases CTP CTP_cases 0 0.429 0 0.580
## 10 AK 2020-03-15 cases CTP CTP_cases 0 0.857 0 1.16
## # ... with 104,093 more rows
##
## $popData
## # A tibble: 51 x 2
## state pop
## <chr> <dbl>
## 1 AK 738432
## 2 AL 4858979
## 3 AR 2978204
## 4 AZ 6828065
## 5 CA 39144818
## 6 CO 5456574
## 7 CT 3590886
## 8 DC 672228
## 9 DE 945934
## 10 FL 20271272
## # ... with 41 more rows
##
## $caseDeath
## NULL
##
## $curveList
## NULL
##
## $cfrList
## NULL
# Example for using the full stateData file
nenynjList <- integrateStateData(stateData=fullStateList$stateData,
popData=fullStateList$popData,
runAll=TRUE,
keyStates=c("NY", "NJ", "MA", "CT", "RI", "NH", "VT", "ME"),
combStates=c("MA"="S NE", "CT"="S NE", "RI"="S NE",
"NH"="N NE", "VT"="N NE", "ME"="N NE"
),
lagEarlyValue=NULL,
lagLateValue=NULL
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.07901078
##
## Will scale by: 0.07901078
## # A tibble: 4 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 N NE 3 27
## 2 NY 6 17
## 3 S NE 6 0
## 4 NJ 8 30
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (geom_text).
# Example for alignment between USA Facts and CTP
testList <- integrateStateData(stateData=fullStateList$stateData,
popData=fullStateList$popData,
runAll=TRUE,
keyStates=c("NY", "NJ", "MA", "CT", "RI", "NH", "VT", "ME"),
combStates=c("MA"="S NE", "CT"="S NE", "RI"="S NE",
"NH"="N NE", "VT"="N NE", "ME"="N NE"
),
var1="CTP_deaths",
var2="USAF_deaths",
lagEarlyValue=NULL,
lagLateValue=NULL
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 1.281759
##
## Will scale by: 1.281759
## # A tibble: 4 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 N NE 0 0
## 2 NJ 0 0
## 3 S NE 1 2
## 4 NY 2 4
## Warning: Removed 8 row(s) containing missing values (geom_path).
## Warning: Removed 8 rows containing missing values (position_stack).
## Warning: Removed 8 rows containing missing values (geom_text).
The differences in data reporting between CTP and USA Facts stand out. Lags are generally zero, though the spikes mean that each source has a different cumulative number of deaths. The process can similarly be run for the Great Lakes states (excluding Pennsylvania and New York):
# Example for using the full stateData file
glakesList <- integrateStateData(stateData=fullStateList$stateData,
popData=fullStateList$popData,
runAll=TRUE,
keyStates=c("MN", "WI", "IL", "IN", "MI", "OH"),
lagEarlyValue=NULL,
lagLateValue=NULL
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.01614313
##
## Will scale by: 0.01614313
## Warning: Removed 3 row(s) containing missing values (geom_path).
## # A tibble: 6 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 IN 2 2
## 2 MN 3 13
## 3 WI 3 30
## 4 IL 5 3
## 5 MI 10 0
## 6 OH 11 19
Creating separate lags by state as well as separate CFR by month is important in the Great Lakes. Michigan had a very large spike early when there was little case-death lag and a high CFR. By contrast, Wisconsin had a large late spike when there was more meaningful case-death lag and a much lower CFR. Indiana especially needs the variation by month as its early spike had ~6% CFR and its late spike had ~1% CFR.
Functions have been updated so that .groups=“drop_last” in every summarize() call to avoid the note about dplyr default behavior. Further, the lm() has been updated to na.action=na.exclude so that predict() will return NA for all rows with missing data (required for hospital data which has variable date of entry by state).
Suppose that the goal is to check how hospitalizations and deaths correlate:
# Example for using the full stateData file
glakesHospList <- integrateStateData(stateData=fullStateList$stateData,
popData=fullStateList$popData,
runAll=TRUE,
keyStates=c("MN", "WI", "IL", "IN", "MI", "OH"),
var1="CTP_hosp",
var2="CTP_deaths",
lagEarlyValue=NULL,
lagLateValue=NULL
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.04114457
##
## Will scale by: 0.04114457
## Warning: Removed 3 row(s) containing missing values (geom_path).
## # A tibble: 6 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 MN 0 11
## 2 MI 3 5
## 3 WI 3 15
## 4 IN 5 3
## 5 OH 9 24
## 6 IL 10 0
## Warning: Removed 7 rows containing missing values (position_stack).
## Warning: Removed 7 rows containing missing values (geom_text).
Even with total hospitalized being a state variable rather than a flow variable, it appears to be a better predictor of changes in deaths in the next week or so. Specifically, even when using a common 7-day lag across all states and dates, application of a single multiplier in the 2%-3% range by state appears to drive reasonably good convergence of the curves. It is a much more stable predictor over time than cases.
Suppose that the goal is to assess whether increases in CDC all-cause deaths link to increases in coronavirus deaths:
# Example for using the full stateData file
glakesCDC1List <- integrateStateData(stateData=fullStateList$stateData,
popData=fullStateList$popData,
runAll=TRUE,
keyStates=c("MN", "WI", "IL", "IN", "MI", "OH"),
var1="CDC_excess",
var2="CTP_deaths",
lagEarlyValue=NULL,
lagLateValue=NULL
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.7824127
##
## Will scale by: 0.7824127
## Warning: Removed 59 row(s) containing missing values (geom_path).
## # A tibble: 6 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 MN 0 0
## 2 OH 1 2
## 3 MI 3 7
## 4 IN 5 18
## 5 WI 7 4
## 6 IL 8 0
## Warning: Removed 110 row(s) containing missing values (geom_path).
# Example for using the full stateData file
glakesCDC2List <- integrateStateData(stateData=fullStateList$stateData,
popData=fullStateList$popData,
runAll=TRUE,
keyStates=c("MN", "WI", "IL", "IN", "MI", "OH"),
var1="CTP_deaths",
var2="CDC_excess",
lagEarlyValue=NULL,
lagLateValue=NULL
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 1.278098
##
## Will scale by: 1.278098
## Warning: Removed 63 row(s) containing missing values (geom_path).
## # A tibble: 6 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 IL 0 0
## 2 IN 0 26
## 3 MI 0 12
## 4 MN 0 4
## 5 OH 0 23
## 6 WI 0 0
## Warning: Removed 126 row(s) containing missing values (geom_path).
The link between CDC excess all-cause deaths and reported coronavirus deaths is less clear. There appear to be roughly 30% more excess all-cause deaths than reported coronavirus deaths, and with the excess all-cause deaths sometimes leading the reported coronavirus deaths (possibly due to differences in how death date is tracked). Further, Wisconsin did not have excess deaths until Fall, so the predictive nature of this metric is poor in Wisconsin.
Do increases in coronavirus cases predict increases in CDC all-cause deaths?
# Example for using the full stateData file
glakesCDC3List <- integrateStateData(stateData=fullStateList$stateData,
popData=fullStateList$popData,
runAll=TRUE,
keyStates=c("MN", "WI", "IL", "IN", "MI", "OH"),
var1="CTP_cases",
var2="CDC_excess",
lagEarlyValue=NULL,
lagLateValue=NULL
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.0206325
##
## Will scale by: 0.0206325
## Warning: Removed 66 row(s) containing missing values (geom_path).
## # A tibble: 6 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 IL 0 9
## 2 IN 0 13
## 3 MN 0 15
## 4 OH 0 29
## 5 WI 0 5
## 6 MI 7 16
## Warning: Removed 126 row(s) containing missing values (geom_path).
Since excess deaths and coronavirus deaths are reasonably correlated, increases in cases help predict increases in excess all-cause deaths. The rate of excess death per case decreases significantly with time.
The full process is updated with an additional two weeks of data:
# Use existing segments with updated data
locDownload <- "./RInputFiles/Coronavirus/CV_downloaded_201120.csv"
test_old_201120 <- readRunCOVIDTrackingProject(thruLabel="Nov 19, 2020",
downloadTo=if (file.exists(locDownload)) NULL else locDownload,
readFrom=locDownload,
compareFile=readFromRDS("test_hier5_201025")$dfRaw,
useClusters=readFromRDS("test_hier5_201025")$useClusters
)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## totalTestResultsSource = col_character(),
## dataQualityGrade = col_character(),
## lastUpdateEt = col_character(),
## dateModified = col_datetime(format = ""),
## checkTimeEt = col_character(),
## dateChecked = col_datetime(format = ""),
## fips = col_character(),
## hash = col_character(),
## grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
##
## File is unique by state and date
##
##
## Overall control totals in file:
## # A tibble: 1 x 3
## positiveIncrease deathIncrease hospitalizedCurrently
## <dbl> <dbl> <dbl>
## 1 11556034 243675 10193684
##
## *** COMPARISONS TO REFERENCE FILE: compareFile
##
## Checkin for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checkin for similarity of: states
## In reference but not in current:
## In current but not in reference:
##
## Checkin for similarity of: dates
## In reference but not in current:
## In current but not in reference: 2020-11-19 2020-11-18 2020-11-17 2020-11-16 2020-11-15 2020-11-14 2020-11-13 2020-11-12 2020-11-11 2020-11-10 2020-11-09 2020-11-08 2020-11-07 2020-11-06 2020-11-05 2020-11-04 2020-11-03 2020-11-02 2020-11-01 2020-10-31 2020-10-30 2020-10-29 2020-10-28 2020-10-27 2020-10-26 2020-10-25
##
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("date", "name")
## date name newValue oldValue
## 1 2020-03-06 positiveIncrease 116 109
## 2 2020-03-07 positiveIncrease 165 176
## 3 2020-03-10 positiveIncrease 407 387
## 4 2020-03-11 positiveIncrease 502 509
## 5 2020-03-12 positiveIncrease 686 671
## 6 2020-03-13 positiveIncrease 1055 1072
## 7 2020-03-15 positiveIncrease 1263 1291
## 8 2020-03-16 positiveIncrease 1776 1739
## 9 2020-03-18 positiveIncrease 3037 3089
## 10 2020-03-19 positiveIncrease 4706 4651
## 11 2020-03-21 hospitalizedCurrently 1492 1436
## 12 2020-03-23 hospitalizedCurrently 2812 2770
## 13 2020-03-25 hospitalizedCurrently 5140 5062
## 14 2020-03-28 positiveIncrease 19601 19925
## 15 2020-03-28 deathIncrease 551 544
## 16 2020-03-29 deathIncrease 504 515
## 17 2020-03-30 positiveIncrease 21485 22042
## 18 2020-03-31 positiveIncrease 25174 24853
## 19 2020-03-31 deathIncrease 907 890
## 20 2020-04-01 positiveIncrease 26128 25791
## 21 2020-04-04 positiveIncrease 32867 33212
## 22 2020-04-06 positiveIncrease 28410 29002
## 23 2020-04-09 positiveIncrease 35116 34503
## 24 2020-04-10 positiveIncrease 33473 34380
## 25 2020-04-10 deathIncrease 2072 2108
## 26 2020-04-11 positiveIncrease 31092 30501
## 27 2020-04-11 deathIncrease 2079 2054
## 28 2020-04-13 positiveIncrease 24384 25195
## 29 2020-04-14 positiveIncrease 26080 25719
## 30 2020-04-15 positiveIncrease 29859 30307
## 31 2020-04-16 positiveIncrease 31581 30978
## 32 2020-04-23 deathIncrease 1814 1791
## 33 2020-04-24 deathIncrease 1972 1895
## 34 2020-04-25 deathIncrease 1627 1748
## 35 2020-04-27 deathIncrease 1287 1270
## 36 2020-04-29 deathIncrease 2685 2713
## 37 2020-05-01 deathIncrease 1808 1779
## 38 2020-05-02 deathIncrease 1531 1562
## 39 2020-05-05 deathIncrease 2496 2452
## 40 2020-05-06 deathIncrease 1915 1948
## 41 2020-05-08 deathIncrease 1780 1798
## 42 2020-05-12 positiveIncrease 22559 22890
## 43 2020-05-12 deathIncrease 1505 1486
## 44 2020-05-13 positiveIncrease 21627 21285
## 45 2020-05-13 deathIncrease 1736 1704
## 46 2020-05-14 deathIncrease 1854 1879
## 47 2020-05-15 positiveIncrease 25422 24685
## 48 2020-05-15 deathIncrease 1265 1507
## 49 2020-05-16 positiveIncrease 23593 24702
## 50 2020-05-16 deathIncrease 1195 987
## 51 2020-05-18 deathIncrease 890 848
## 52 2020-05-21 deathIncrease 1426 1394
## 53 2020-05-22 positiveIncrease 24173 24433
## 54 2020-05-22 deathIncrease 1303 1341
## 55 2020-05-23 positiveIncrease 22365 21531
## 56 2020-05-23 deathIncrease 1035 1063
## 57 2020-05-24 positiveIncrease 18859 20072
## 58 2020-05-25 deathIncrease 553 559
## 59 2020-05-26 deathIncrease 673 645
## 60 2020-05-28 deathIncrease 1245 1231
## 61 2020-05-29 deathIncrease 1167 1184
## 62 2020-05-30 positiveIncrease 23437 23682
## 63 2020-05-30 deathIncrease 917 932
## 64 2020-06-02 deathIncrease 1000 962
## 65 2020-06-03 positiveIncrease 20155 20390
## 66 2020-06-03 deathIncrease 979 993
## 67 2020-06-04 positiveIncrease 20383 20886
## 68 2020-06-04 deathIncrease 868 893
## 69 2020-06-05 positiveIncrease 23065 23394
## 70 2020-06-05 deathIncrease 840 826
## 71 2020-06-06 positiveIncrease 22560 23064
## 72 2020-06-06 deathIncrease 710 728
## 73 2020-06-08 deathIncrease 679 661
## 74 2020-06-12 positiveIncrease 23095 23597
## 75 2020-06-12 deathIncrease 763 775
## 76 2020-06-15 deathIncrease 407 381
## 77 2020-06-16 deathIncrease 707 730
## 78 2020-06-17 deathIncrease 794 767
## 79 2020-06-18 positiveIncrease 27088 27746
## 80 2020-06-18 deathIncrease 690 705
## 81 2020-06-19 positiveIncrease 30960 31471
## 82 2020-06-20 positiveIncrease 31950 32294
## 83 2020-06-20 deathIncrease 611 629
## 84 2020-06-21 positiveIncrease 28848 27928
## 85 2020-06-22 deathIncrease 295 286
## 86 2020-06-23 positiveIncrease 33884 33447
## 87 2020-06-23 deathIncrease 725 710
## 88 2020-06-24 deathIncrease 706 724
## 89 2020-06-25 deathIncrease 664 647
## 90 2020-06-26 deathIncrease 625 637
## 91 2020-06-27 deathIncrease 502 511
## 92 2020-06-29 deathIncrease 358 332
## 93 2020-06-30 deathIncrease 585 596
## 94 2020-07-01 deathIncrease 688 701
## 95 2020-07-02 positiveIncrease 53508 54085
## 96 2020-07-04 deathIncrease 300 306
## 97 2020-07-06 positiveIncrease 41494 41959
## 98 2020-07-06 deathIncrease 266 243
## 99 2020-07-07 deathIncrease 904 923
## 100 2020-07-09 deathIncrease 900 867
## 101 2020-07-10 deathIncrease 822 854
## 102 2020-07-17 deathIncrease 935 951
## 103 2020-07-20 deathIncrease 368 363
## 104 2020-07-21 deathIncrease 1070 1039
## 105 2020-07-22 deathIncrease 1136 1171
## 106 2020-07-24 deathIncrease 1190 1176
## 107 2020-07-25 deathIncrease 1008 1023
## 108 2020-07-26 positiveIncrease 60123 61000
## 109 2020-08-01 positiveIncrease 60247 61101
## 110 2020-08-02 deathIncrease 492 498
## 111 2020-08-03 deathIncrease 536 519
## 112 2020-08-04 deathIncrease 1238 1255
## 113 2020-08-08 positiveIncrease 53084 53712
## 114 2020-08-10 deathIncrease 437 426
## 115 2020-08-14 positiveIncrease 57093 55636
## 116 2020-08-17 positiveIncrease 37411 37880
## 117 2020-08-17 deathIncrease 418 407
## 118 2020-08-21 deathIncrease 1108 1123
## 119 2020-08-22 positiveIncrease 45723 46236
## 120 2020-08-24 positiveIncrease 34250 34643
## 121 2020-08-24 deathIncrease 352 343
## 122 2020-08-29 positiveIncrease 43967 44501
## 123 2020-08-31 deathIncrease 380 366
## 124 2020-09-02 positiveIncrease 30217 30603
## 125 2020-09-07 positiveIncrease 28143 28682
## 126 2020-09-09 deathIncrease 1102 1084
## 127 2020-09-15 positiveIncrease 34904 35445
## 128 2020-09-15 deathIncrease 1044 1031
## 129 2020-09-16 deathIncrease 1184 1200
## 130 2020-09-19 positiveIncrease 44905 45564
## 131 2020-09-20 positiveIncrease 35503 36295
## 132 2020-09-22 deathIncrease 866 854
## 133 2020-09-23 deathIncrease 1147 1159
## 134 2020-09-27 positiveIncrease 34987 35454
## 135 2020-09-27 deathIncrease 302 307
## 136 2020-09-28 positiveIncrease 35883 36524
## 137 2020-09-28 deathIncrease 265 257
## 138 2020-09-29 positiveIncrease 36441 36947
## 139 2020-10-03 deathIncrease 733 741
## 140 2020-10-04 positiveIncrease 37988 38439
## 141 2020-10-05 deathIncrease 341 326
## 142 2020-10-11 positiveIncrease 46269 46946
## 143 2020-10-12 positiveIncrease 42643 43124
## 144 2020-10-13 deathIncrease 700 690
## 145 2020-10-14 positiveIncrease 56117 56797
## 146 2020-10-15 deathIncrease 937 951
## 147 2020-10-16 deathIncrease 891 877
## 148 2020-10-17 positiveIncrease 57330 57943
## 149 2020-10-17 deathIncrease 762 780
## 150 2020-10-18 positiveIncrease 48284 48922
## 151 2020-10-19 deathIncrease 461 456
## 152 2020-10-21 positiveIncrease 60953 58606
## 153 2020-10-22 positiveIncrease 72842 75248
## Joining, by = c("date", "name")
## Warning: Removed 26 row(s) containing missing values (geom_path).
##
##
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("state", "name")
## state name newValue oldValue
## 1 AK positiveIncrease 12523 13535
## 2 CO positiveIncrease 93398 91570
## 3 FL positiveIncrease 766305 776249
## 4 NM positiveIncrease 41040 40168
## 5 NM hospitalizedCurrently 27399 27120
## 6 PR positiveIncrease 31067 61275
## 7 RI positiveIncrease 30581 30116
## Rows: 14,673
## Columns: 55
## $ date <date> 2020-11-19, 2020-11-19, 2020-11-19, 20...
## $ state <chr> "AK", "AL", "AR", "AS", "AZ", "CA", "CO...
## $ positive <dbl> 24909, 225910, 139855, 0, 287225, 10592...
## $ probableCases <dbl> NA, 36449, 15690, NA, 8007, NA, 9148, 6...
## $ negative <dbl> 873890, 1307256, 1428285, 1988, 1758070...
## $ pending <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestResultsSource <chr> "totalTestsViral", "totalTestsViral", "...
## $ totalTestResults <dbl> 898799, 1496717, 1552450, 1988, 2037288...
## $ hospitalizedCurrently <dbl> 139, 1315, 891, NA, 1796, 5319, 1593, 8...
## $ hospitalizedCumulative <dbl> 593, 23295, 8268, NA, 23871, NA, 11980,...
## $ inIcuCurrently <dbl> NA, NA, 353, NA, 433, 1253, NA, NA, 34,...
## $ inIcuCumulative <dbl> NA, 2182, NA, NA, NA, NA, NA, NA, NA, N...
## $ onVentilatorCurrently <dbl> 14, NA, 143, NA, 227, NA, NA, NA, 10, N...
## $ onVentilatorCumulative <dbl> NA, 1261, 925, NA, NA, NA, NA, NA, NA, ...
## $ recovered <dbl> 7165, 90702, 120545, NA, 46951, NA, 103...
## $ dataQualityGrade <chr> "A", "A", "A+", "D", "A+", "B", "A", "C...
## $ lastUpdateEt <chr> "11/19/2020 03:59", "11/19/2020 11:00",...
## $ dateModified <dttm> 2020-11-19 03:59:00, 2020-11-19 11:00:...
## $ checkTimeEt <chr> "11/18 22:59", "11/19 06:00", "11/18 19...
## $ death <dbl> 101, 3419, 2297, 0, 6384, 18466, 2350, ...
## $ hospitalized <dbl> 593, 23295, 8268, NA, 23871, NA, 11980,...
## $ dateChecked <dttm> 2020-11-19 03:59:00, 2020-11-19 11:00:...
## $ totalTestsViral <dbl> 898799, 1496717, 1552450, 1988, NA, 215...
## $ positiveTestsViral <dbl> 29798, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ negativeTestsViral <dbl> 868482, NA, 1428285, NA, NA, NA, NA, NA...
## $ positiveCasesViral <dbl> NA, 189461, 124165, 0, 279218, 1059267,...
## $ deathConfirmed <dbl> 101, 3123, 2105, NA, 5919, NA, NA, 3862...
## $ deathProbable <dbl> NA, 296, 192, NA, 465, NA, NA, 943, NA,...
## $ totalTestEncountersViral <dbl> NA, NA, NA, NA, NA, NA, 2695700, NA, 60...
## $ totalTestsPeopleViral <dbl> NA, NA, NA, NA, 2037288, NA, 1537359, N...
## $ totalTestsAntibody <dbl> NA, NA, NA, NA, 354092, NA, 194983, NA,...
## $ positiveTestsAntibody <dbl> NA, NA, NA, NA, NA, NA, 15141, NA, NA, ...
## $ negativeTestsAntibody <dbl> NA, NA, NA, NA, NA, NA, 179842, NA, NA,...
## $ totalTestsPeopleAntibody <dbl> NA, 68847, NA, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ negativeTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestsPeopleAntigen <dbl> NA, NA, 107231, NA, NA, NA, NA, NA, NA,...
## $ positiveTestsPeopleAntigen <dbl> NA, NA, 18437, NA, NA, NA, NA, NA, NA, ...
## $ totalTestsAntigen <dbl> NA, NA, 21856, NA, NA, NA, NA, 26512, N...
## $ positiveTestsAntigen <dbl> NA, NA, 3300, NA, NA, NA, NA, NA, NA, N...
## $ fips <chr> "02", "01", "05", "60", "04", "06", "08...
## $ positiveIncrease <dbl> 490, 2424, 2238, 0, 4123, 11478, 6107, ...
## $ negativeIncrease <dbl> 12751, 13211, 12983, 0, 14417, 122507, ...
## $ total <dbl> 898799, 1533166, 1568140, 1988, 2045295...
## $ totalTestResultsIncrease <dbl> 13241, 15049, 14667, 0, 18293, 133985, ...
## $ posNeg <dbl> 898799, 1533166, 1568140, 1988, 2045295...
## $ deathIncrease <dbl> 1, 72, 22, 0, 19, 106, 26, 21, 2, 0, 81...
## $ hospitalizedIncrease <dbl> 9, 207, 129, 0, 398, 0, 178, 0, 0, 0, 2...
## $ hash <chr> "dd60caa3c0b65aa928204156ef405a95116f3f...
## $ commercialScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeRegularScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ positiveScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ score <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ grade <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
##
##
## Control totals - note that validState other than TRUE will be discarded
##
## # A tibble: 2 x 6
## validState cases deaths hosp tests n
## <lgl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 FALSE 52410 1108 NA 474990 1245
## 2 TRUE 11503624 242567 NA 173236764 13428
## Rows: 13,428
## Columns: 6
## $ date <date> 2020-11-19, 2020-11-19, 2020-11-19, 2020-11-19, 2020-11-19,...
## $ state <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", ...
## $ cases <dbl> 490, 2424, 2238, 4123, 11478, 6107, 2353, 213, 398, 8882, 27...
## $ deaths <dbl> 1, 72, 22, 19, 106, 26, 21, 2, 0, 81, 37, 0, 40, 14, 180, 59...
## $ hosp <dbl> 139, 1315, 891, 1796, 5319, 1593, 840, 127, 165, 3380, 2142,...
## $ tests <dbl> 13241, 15049, 14667, 18293, 133985, 55085, 36596, 6422, 1216...
## Rows: 13,428
## Columns: 14
## $ date <date> 2020-01-22, 2020-01-22, 2020-01-23, 2020-01-23, 2020-01-24,...
## $ state <chr> "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", ...
## $ cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hosp <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tests <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ cpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ dpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm <dbl> 0.0000000, 0.0000000, 0.1471796, 0.0000000, 0.0000000, 0.000...
## $ cpm7 <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ dpm7 <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ hpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm7 <dbl> NA, NA, NA, NA, NA, NA, 0.04205130, 0.00000000, 0.06307695, ...
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
##
## Recency is defined as 2020-10-21 through current
##
## Recency is defined as 2020-10-21 through current
## `summarise()` regrouping output by 'state', 'cluster', 'date' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Persist the CTP run; ovrWriteError = FALSE leaves any existing RDS in place
# rather than raising an error when the file already exists on disk.
saveToRDS(test_old_201120, ovrWriteError = FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/test_old_201120.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
# Locations for the population, case, and death files
popLoc <- "./RInputFiles/Coronavirus/covid_county_population_usafacts.csv"
caseLoc <- "./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20201120.csv"
deathLoc <- "./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201120.csv"

# Read the prior county results once and reuse them below; the original code
# called readFromRDS("cty_20201026") twice, deserializing the same RDS file
# from disk two times.
cty_prior <- readFromRDS("cty_20201026")

# Run old segments against new data.
# dlCaseDeath: only download the case/death files when either local copy is
# missing (&& is the idiomatic scalar conjunction for a length-1 condition).
cty_old_20201120 <- readRunUSAFacts(
  maxDate = "2020-11-18",
  popLoc = popLoc,
  caseLoc = caseLoc,
  deathLoc = deathLoc,
  dlCaseDeath = !(file.exists(caseLoc) && file.exists(deathLoc)),
  oldFile = cty_prior$dfBurden,
  existingCountyClusters = cty_prior$clustVec
)
##
## -- Column specification --------------------------------------------------------
## cols(
## countyFIPS = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## population = col_double()
## )
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Rows: 964,890
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumCases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Rows: 964,890
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumDeaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## `geom_smooth()` using formula 'y ~ x'
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
##
## Shapes will be created without any floor on the number of cases per million
## Shapes will be created without any floor on the number of deaths per million
## *** Counties with 0 cases/deaths or that fall below the floor for minCase/minDeath ***
## # A tibble: 1 x 4
## cpm_mean_is0 dpm_mean_is0 dpm_mean_ltDeath cpm_mean_ltCase
## <dbl> <dbl> <dbl> <dbl>
## 1 0 0.00817 0 0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
##
## Recency is defined as 2020-10-20 through current
##
## Recency is defined as 2020-10-20 through current
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## Joining, by = "fipsCounty"
## Joining, by = "fipsCounty"
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Persist the county-level run; ovrWriteError = FALSE keeps an existing RDS
# instead of treating the pre-existing file as an error.
saveToRDS(cty_old_20201120, ovrWriteError = FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/cty_old_20201120.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
There appears to be a county-level data anomaly for FIPS 22053, which shows 18,544 cases diagnosed on 18-NOV in a county with a population of roughly 30,000 people. The process should be updated to allow spurious data points like this one to be excluded.
# Use data that have previously been downloaded
cdcDir <- "./RInputFiles/Coronavirus"
cdcLoc <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20201120.csv"

# Process the CDC all-cause mortality file.
# dlData: download only when the local copy is absent; file.path() builds the
# path instead of paste0() so separators are handled consistently.
# stateNoCheck: NC is excluded from the suppression checks (suppressed weeks
# appear in the output below).
cdcList_20201120 <- readRunCDCAllCause(
  loc = cdcLoc,
  startYear = 2015,
  curYear = 2020,
  weekThru = 38,
  startWeek = 9,
  lst = readFromRDS("test_old_201120"),
  epiMap = readFromRDS("epiMonth"),
  agePopData = readFromRDS("usPopBucket2020"),
  cvDeathThru = "2020-09-19",
  cdcPlotStartWeek = 10,
  dlData = !file.exists(file.path(cdcDir, cdcLoc)),
  stateNoCheck = c("NC")
)
## Rows: 179,663
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date` <chr> "01/10/2015", "01/17/2015", "01/24/2015", "01/...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group` <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths` <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period` <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 179,663
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
## age n n_deaths_na deaths
## <chr> <int> <int> <dbl>
## 1 25-44 years 26907 4 3293947
## 2 45-64 years 32860 15 12885563
## 3 65-74 years 32851 16 12792037
## 4 75-84 years 32870 19 15899092
## 5 85 years and older 32856 19 20718264
## 6 Under 25 years 21319 2 1416502
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups: period, year [6]
## period year type n n_deaths_na deaths
## <chr> <int> <chr> <int> <int> <dbl>
## 1 2015-2019 2015 Predicted (weighted) 15285 0 5416391
## 2 2015-2019 2015 Unweighted 15285 0 5416391
## 3 2015-2019 2016 Predicted (weighted) 15365 0 5483764
## 4 2015-2019 2016 Unweighted 15365 0 5483764
## 5 2015-2019 2017 Predicted (weighted) 15318 0 5643347
## 6 2015-2019 2017 Unweighted 15318 0 5643347
## 7 2015-2019 2018 Predicted (weighted) 15307 0 5698022
## 8 2015-2019 2018 Unweighted 15307 0 5698022
## 9 2015-2019 2019 Predicted (weighted) 15318 0 5725502
## 10 2015-2019 2019 Unweighted 15318 0 5725502
## 11 2020 2020 Predicted (weighted) 13260 41 5584252
## 12 2020 2020 Unweighted 13217 34 5487101
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups: period [2]
## period Suppress n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-2019 <NA> 153186 0 5.59e7
## 2 2020 Suppressed (counts highly incomplete, <5~ 75 75 0.
## 3 2020 <NA> 26402 0 1.11e7
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 9 x 5
## # Groups: period [2]
## period Note n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-20~ <NA> 153186 0 5.59e7
## 2 2020 Data in recent weeks are incomplete. Only~ 21043 34 9.13e6
## 3 2020 Data in recent weeks are incomplete. Only~ 444 0 2.04e5
## 4 2020 Data in recent weeks are incomplete. Only~ 339 22 4.45e4
## 5 2020 Data in recent weeks are incomplete. Only~ 2241 19 7.10e5
## 6 2020 Data in recent weeks are incomplete. Only~ 48 0 2.62e4
## 7 2020 Estimates for Pennsylvania are too low fo~ 48 0 2.26e4
## 8 2020 Weights may be too low to account for und~ 312 0 1.16e5
## 9 2020 <NA> 2002 0 8.22e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## state Jurisdiction n n_deaths_na deaths
## 1 US United States 3660 0 33382812
## 2 CA California 3660 0 3175134
## 3 FL Florida 3660 0 2439960
## 4 TX Texas 3660 0 2405990
## 5 PA Pennsylvania 3660 0 1600457
## 6 OH Ohio 3660 0 1443190
## 7 IL Illinois 3660 0 1260836
## 8 NY New York 3660 0 1191716
## 9 MI Michigan 3660 0 1148050
## 10 NC North Carolina 3573 33 1068292
## 11 GA Georgia 3659 0 1002322
## 12 NJ New Jersey 3654 0 892684
## 13 TN Tennessee 3660 0 872960
## 14 VA Virginia 3660 0 802114
## 15 IN Indiana 3658 0 776857
## 16 MO Missouri 3656 0 755398
## 17 AZ Arizona 3660 0 707602
## 18 MA Massachusetts 3624 0 705559
## 19 YC New York City 3656 0 689474
## 20 WA Washington 3658 10 664276
## 21 AL Alabama 3658 0 620268
## 22 WI Wisconsin 3640 0 614607
## 23 MD Maryland 3654 0 589675
## 24 SC South Carolina 3658 0 582138
## 25 KY Kentucky 3621 0 564915
## 26 LA Louisiana 3655 0 545500
## 27 MN Minnesota 3612 0 521445
## 28 CO Colorado 3657 0 462126
## 29 OK Oklahoma 3649 0 461666
## 30 OR Oregon 3490 0 427650
## 31 MS Mississippi 3595 0 376968
## 32 AR Arkansas 3552 0 375477
## 33 CT Connecticut 3215 17 367731
## 34 IA Iowa 3293 0 352522
## 35 PR Puerto Rico 3372 0 343055
## 36 KS Kansas 3348 0 307450
## 37 NV Nevada 3394 0 299546
## 38 WV West Virginia 3098 11 258798
## 39 UT Utah 3546 0 221678
## 40 NM New Mexico 3231 0 212208
## 41 NE Nebraska 2942 0 195453
## 42 ME Maine 2732 0 165642
## 43 ID Idaho 2856 0 158780
## 44 NH New Hampshire 2751 0 139437
## 45 HI Hawaii 2642 0 128673
## 46 RI Rhode Island 2553 0 117919
## 47 MT Montana 2638 0 114177
## 48 DE Delaware 2645 0 102858
## 49 SD South Dakota 2522 4 89560
## 50 ND North Dakota 2512 0 78477
## 51 DC District of Columbia 2630 0 66270
## 52 VT Vermont 2416 0 64000
## 53 WY Wyoming 2395 0 49201
## 54 AK Alaska 2433 0 43852
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## Rows: 179,663
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 87,863
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
##
## *** Data suppression checks ***
## # A tibble: 14 x 11
## Jurisdiction weekEnding state year week age deaths period type Suppress
## <chr> <date> <chr> <fct> <int> <fct> <dbl> <fct> <chr> <chr>
## 1 North Carol~ 2020-09-05 NC 2020 36 25-4~ NA 2020 Pred~ Suppres~
## 2 North Carol~ 2020-09-05 NC 2020 36 45-6~ NA 2020 Pred~ Suppres~
## 3 North Carol~ 2020-09-12 NC 2020 37 45-6~ NA 2020 Pred~ Suppres~
## 4 North Carol~ 2020-09-19 NC 2020 38 45-6~ NA 2020 Pred~ Suppres~
## 5 North Carol~ 2020-09-05 NC 2020 36 65-7~ NA 2020 Pred~ Suppres~
## 6 North Carol~ 2020-09-12 NC 2020 37 65-7~ NA 2020 Pred~ Suppres~
## 7 North Carol~ 2020-09-19 NC 2020 38 65-7~ NA 2020 Pred~ Suppres~
## 8 North Carol~ 2020-09-05 NC 2020 36 75-8~ NA 2020 Pred~ Suppres~
## 9 North Carol~ 2020-09-12 NC 2020 37 75-8~ NA 2020 Pred~ Suppres~
## 10 North Carol~ 2020-09-19 NC 2020 38 75-8~ NA 2020 Pred~ Suppres~
## 11 North Carol~ 2020-09-05 NC 2020 36 85 y~ NA 2020 Pred~ Suppres~
## 12 North Carol~ 2020-09-12 NC 2020 37 85 y~ NA 2020 Pred~ Suppres~
## 13 North Carol~ 2020-09-19 NC 2020 38 85 y~ NA 2020 Pred~ Suppres~
## 14 North Carol~ 2020-09-05 NC 2020 36 Unde~ NA 2020 Pred~ Suppres~
## # ... with 1 more variable: Note <chr>
##
## *** Data suppression checks failed - total of 14 suppressions
## *** Of these suppressions, 10 are NOT from weekThru of current year
## Continuing since all states with problems are in stateNoCheck
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)
## Rows: 82,639
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## First duplicate is in row number (0 means no duplicates): 0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)
## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)
## `summarise()` regrouping output by 'year', 'age', 'week' (override with `.groups` argument)
##
## Plots will be run after excluding stateNoCheck states
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'state', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "state"
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'age', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'age' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Persist the CDC list; ovrWriteError=FALSE means an existing file is kept
# (with a message, as shown below) rather than raising an error
saveToRDS(cdcList_20201120, ovrWriteError=FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/cdcList_20201120.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
# Example for creating a full stateData file
# Combine the three saved data sources (CTP, USAF county clusters, CDC)
# into a single state-level list
fullStateList_20201120 <- integrateStateData(ctpList=readFromRDS("test_old_201120"),
usafData=readFromRDS("cty_old_20201120")$clusterStateData,
cdcList=readFromRDS("cdcList_20201120")
)
##
## Building stateData from the passed components
## Rows: 109,123
## Columns: 9
## $ state <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", ...
## $ date <date> 2020-03-06, 2020-03-07, 2020-03-08, 2020-03-09, 2020-03-10,...
## $ metric <chr> "cases", "cases", "cases", "cases", "cases", "cases", "cases...
## $ source <chr> "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP...
## $ name <chr> "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cas...
## $ value <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 5, 3, 3, 1, 10, 13, 4, 7...
## $ value7 <dbl> NA, NA, NA, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.00...
## $ vpm <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, ...
## $ vpm7 <dbl> NA, NA, NA, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.00...
## # A tibble: 8 x 3
## name value value7
## <chr> <dbl> <dbl>
## 1 CDC_deaths 2340646. 2289614.
## 2 CDC_excess 255796. 254403.
## 3 CTP_cases 11503624 11013435.
## 4 CTP_deaths 242567 238041
## 5 CTP_hosp 10124707 9790131.
## 6 CTP_tests 173236764 168394308.
## 7 USAF_cases 11269520 10812984.
## 8 USAF_deaths 245506 241695.
fullStateList_20201120
## $stateData
## # A tibble: 109,123 x 9
## state date metric source name value value7 vpm vpm7
## <chr> <date> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 AK 2020-03-06 cases CTP CTP_cases 0 NA 0 NA
## 2 AK 2020-03-07 cases CTP CTP_cases 0 NA 0 NA
## 3 AK 2020-03-08 cases CTP CTP_cases 0 NA 0 NA
## 4 AK 2020-03-09 cases CTP CTP_cases 0 0 0 0
## 5 AK 2020-03-10 cases CTP CTP_cases 0 0 0 0
## 6 AK 2020-03-11 cases CTP CTP_cases 0 0 0 0
## 7 AK 2020-03-12 cases CTP CTP_cases 0 0 0 0
## 8 AK 2020-03-13 cases CTP CTP_cases 0 0 0 0
## 9 AK 2020-03-14 cases CTP CTP_cases 0 0.429 0 0.580
## 10 AK 2020-03-15 cases CTP CTP_cases 0 1.14 0 1.55
## # ... with 109,113 more rows
##
## $popData
## # A tibble: 51 x 2
## state pop
## <chr> <dbl>
## 1 AK 738432
## 2 AL 4858979
## 3 AR 2978204
## 4 AZ 6828065
## 5 CA 39144818
## 6 CO 5456574
## 7 CT 3590886
## 8 DC 672228
## 9 DE 945934
## 10 FL 20271272
## # ... with 41 more rows
##
## $caseDeath
## NULL
##
## $curveList
## NULL
##
## $cfrList
## NULL
# Example for using the full stateData file
# Run the full analysis for the Great Lakes states; NULL lag values mean
# the early/late lags are estimated from the data for each state
glakesList_20201120 <- integrateStateData(stateData=fullStateList_20201120$stateData,
popData=fullStateList_20201120$popData,
runAll=TRUE,
keyStates=c("MN", "WI", "IL", "IN", "MI", "OH"),
lagEarlyValue=NULL,
lagLateValue=NULL
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.01264923
##
## Will scale by: 0.01264923
## Warning: Removed 3 row(s) containing missing values (geom_path).
## # A tibble: 6 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 IN 2 0
## 2 MN 3 0
## 3 WI 3 29
## 4 IL 5 6
## 5 MI 10 2
## 6 OH 11 22
Data can be run on some of the states currently experiencing heavy outbreaks, with NY, MI, AZ as reference states that previously had high spikes:
# Example for using the full stateData file
# Plains states with MI, NY, AZ as reference states; ND and SD are combined
# and plotted together as "Dakotas" via combStates
plainsList_20201120 <- integrateStateData(stateData=fullStateList_20201120$stateData,
popData=fullStateList_20201120$popData,
runAll=TRUE,
keyStates=c("MN", "ND", "SD", "MT", "NE",
"KS", "IA", "MI", "NY", "AZ"
),
combStates=c("ND"="Dakotas", "SD"="Dakotas"),
lagEarlyValue=NULL,
lagLateValue=NULL
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.02194667
##
## Will scale by: 0.02194667
## # A tibble: 9 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 KS 0 23
## 2 NE 0 12
## 3 MN 3 0
## 4 NY 6 2
## 5 IA 8 4
## 6 MI 10 2
## 7 Dakotas 19 20
## 8 AZ 21 19
## 9 MT 22 18
## Warning: Removed 10 rows containing missing values (position_stack).
## Warning: Removed 10 rows containing missing values (geom_text).
# Example for using hospitalizations
# Same run as above but with current hospitalizations (var1="CTP_hosp") as
# the independent metric instead of cases
plainsListHosp_20201120 <- integrateStateData(stateData=fullStateList_20201120$stateData,
popData=fullStateList_20201120$popData,
runAll=TRUE,
var1="CTP_hosp",
keyStates=c("MN", "ND", "SD", "MT", "NE",
"IA", "MI", "NY", "AZ"
),
combStates=c("ND"="Dakotas", "SD"="Dakotas"),
lagEarlyValue=NULL,
lagLateValue=NULL
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.04109452
##
## Will scale by: 0.04109452
## # A tibble: 8 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 MN 0 11
## 2 NY 0 0
## 3 Dakotas 1 6
## 4 IA 1 7
## 5 MI 3 4
## 6 MT 3 10
## 7 AZ 26 12
## 8 NE 27 12
## Warning: Removed 9 rows containing missing values (position_stack).
## Warning: Removed 9 rows containing missing values (geom_text).
Two issues should be addressed, both related to the choice of x-variable and the lag assumptions:
The assessStateCFR() function is updated to use a ceiling value as the plotted estimate any time an estimate exceeds the ceiling, and to set a lag of 0 if there is insufficient data for running correlations:
# Updated for automatic lag time assessment:
# Estimates a state-level rate (e.g. CFR) by regressing a dependent metric
# (e.g. deaths) on a lagged independent metric (e.g. cases). The lag can be
# fixed by the caller or estimated per state from correlations over lagsTry.
# Estimates above maxCFR are plotted at the ceiling value and flagged in red;
# states with insufficient data for correlations fall back to a lag of 0.
# Prints the lag frame and two plots as side effects, then returns the
# per-state regression data frame.
assessStateCFR <- function(lst,
keyStates,
depVar,
indepVar,
depTitleName,
indepTitleName,
keyMetric="vpm7",
lagEarlyDate=as.Date("2020-03-31"),
lagMidDate=NULL,
lagLateDate=as.Date("2020-10-15"),
lagEarlyValue=10,
lagLateValue=20,
lagsTry=0:30,
maxCFR=0.2
) {
# FUNCTION ARGUMENTS:
# lst: A list such as produced by createAndAlignCurves()
# keyStates: The key states to be extracted from the list
# depVar: the dependent variable
# indepVar: the independent variable
# depTitleName: the name for the dependent variable in the title
# indepTitleName: the name for the independent variable in the plot title
# keyMetric: the name of the key metric that is being assessed
# lagEarlyDate: the date for the earliest lagging calculation (dates before this will be at lagEarlyValue)
# lagMidDate: if lags are found from data, what midpoint should be used to split data as early vs late?
# NULL means midway between lagEarlyDate and lagLateDate
# lagLateDate: the date for the latest lagging calculation (dates after this will be at lagLateValue)
# lagEarlyValue: the value for lag on lagEarlyDate, will be linearly interpolated to lagLateValue/Date
# NULL means calculate from data and may differ by state
# lagLateValue: the value for lag on lagLateDate, will be linearly interpolated from lagEarlyValue/Date
# NULL means estimate from data and may differ by state
# lagsTry: the values for lag to be attempted if lagEarlyValue and/or lagLateValue is NULL
# maxCFR: the maximum CFR to use (anything above will be scaled back to this ceiling value)
# RETURNS: the data frame used for the regression (includes x, y, pred columns)
# Extract the data for keyStates and pivot so each metric name becomes a column
df <- lst[["dfList"]] %>%
select_at(vars(all_of(c("state", "date", "name", keyMetric)))) %>%
filter(state %in% keyStates, !is.na(get(keyMetric))) %>%
pivot_wider(names_from="name", values_from=keyMetric)
# Function for finding lag time correlations
# For one candidate lag (lt), computes the per-state correlation between the
# dependent metric (dp) and the independent metric (id) lagged by lt rows.
# The right_join back to allStates restores states that had no usable rows
# (their p will be NA) so every state appears at every candidate lag.
helperLagCor <- function(lt, lf, dp, id) {
allStates <- lf %>%
count(state) %>%
select(-n)
lf %>%
group_by(state) %>%
mutate(y=get(dp), x=lag(get(id), lt)) %>%
filter(!is.na(y), !is.na(x)) %>%
summarize(p=cor(x, y, use="complete.obs"), .groups="drop_last") %>%
ungroup() %>%
right_join(allStates, by="state") %>%
mutate(lag=ifelse(is.na(lt), 0, lt)) # lt comes from lagsTry so should never be NA; defensive fallback to 0
}
# Middle date for splitting data into the early vs. late windows
if (is.null(lagMidDate)) lagMidDate <- mean(c(lagEarlyDate, lagLateDate))
# Get the early lags from the data: for each state keep the lag with the
# highest correlation in the early window (ties broken by the first row);
# states whose correlations are all NA drop out here and are set to 0 below
eLag <- map_dfr(.x=lagsTry, .f=helperLagCor, lf=filter(df, date<=lagMidDate), dp=depVar, id=indepVar) %>%
group_by(state) %>%
filter(p==max(p)) %>%
filter(row_number()==1) %>%
ungroup() %>%
select(state, earlyLag=lag)
# Get the late lags from the data (same approach applied to the late window)
lLag <- map_dfr(.x=lagsTry, .f=helperLagCor, lf=filter(df, date>lagMidDate), dp=depVar, id=indepVar) %>%
group_by(state) %>%
filter(p==max(p)) %>%
filter(row_number()==1) %>%
ungroup() %>%
select(state, lateLag=lag)
# Create the full lag frame, including substituting the fixed value(s) if provided
# (states missing from either window -- insufficient data -- default to a lag of 0)
lagFrame <- eLag %>%
full_join(lLag, by="state") %>%
mutate(earlyLag=ifelse(is.na(earlyLag), 0, earlyLag), lateLag=ifelse(is.na(lateLag), 0, lateLag))
if (!is.null(lagEarlyValue)) lagFrame <- lagFrame %>% mutate(earlyLag=lagEarlyValue)
if (!is.null(lagLateValue)) lagFrame <- lagFrame %>% mutate(lateLag=lagLateValue)
print(lagFrame)
# Apply the assumed lagging information:
# x is a date-weighted blend of the early-lagged and late-lagged series,
# interpolating linearly from full weight on the early lag (at lagEarlyDate)
# to full weight on the late lag (at lagLateDate). mean() collapses the
# constant per-state lag to the single value required by dplyr::lag().
# When lLag is NA its contribution to the blend is treated as 0.
fullTime <- as.integer(lagLateDate-lagEarlyDate)
df <- df %>%
left_join(lagFrame, by="state") %>%
arrange(state, date) %>%
group_by(state) %>%
mutate(eLag=lag(get(indepVar), mean(earlyLag)),
lLag=lag(get(indepVar), mean(lateLag)),
pctEarly=pmin(pmax(as.integer(lagLateDate-date)/fullTime, 0), 1),
x=ifelse(is.na(eLag), NA, pctEarly*eLag + (1-pctEarly)*ifelse(is.na(lLag), 0, lLag)),
y=get(depVar),
mon=factor(month.abb[lubridate::month(date)], levels=month.abb)
) %>%
filter(!is.na(x)) %>%
ungroup()
# Regression for data from keyStates: interaction-only model with no
# intercept, so each x:month(:state) coefficient is directly a rate estimate
if (length(keyStates) > 1) stateLM <- lm(y ~ x:mon:state + 0, data=df, na.action=na.exclude)
else stateLM <- lm(y ~ x:mon + 0, data=df, na.action=na.exclude)
# Add the predicted value to df
df <- df %>%
mutate(pred=predict(stateLM))
# Plot of curve overlaps (predicted vs. actual over time, faceted by state)
p1 <- df %>%
select(state, date, y, pred) %>%
pivot_longer(-c(state, date)) %>%
ggplot(aes(x=date, y=value)) +
geom_line(aes(color=c("pred"="Predicted", "y"="Actual")[name], group=name)) +
scale_x_date(date_breaks="1 month", date_labels="%b") +
labs(x="",
y=stringr::str_to_title(depTitleName),
title=paste0("Predicted vs. actual ", depTitleName)
) +
scale_color_discrete("Metric") +
facet_wrap(~state)
print(p1)
# Plot of rate by month: coefficient names are parsed with regexes to recover
# the month and state; estimates above maxCFR are capped at the ceiling value
# and colored red; the estimated lags are annotated in the facet corners,
# anchored to the month containing lagMidDate so each label appears once
p2 <- coef(stateLM) %>%
as.data.frame() %>%
purrr::set_names("CFR") %>%
filter(!is.na(CFR)) %>%
tibble::rownames_to_column("monState") %>%
mutate(mon=factor(stringr::str_replace_all(monState, pattern="x:mon|:state.+", replacement=""),
levels=month.abb
),
state=if (length(keyStates)==1) keyStates
else stringr::str_replace_all(monState, pattern="x:mon[A-Za-z]{3}:state", replacement=""),
colorUse=ifelse(CFR>maxCFR, "red", "lightblue"),
CFR=pmin(CFR, maxCFR)
) %>%
left_join(lagFrame, by="state") %>%
ggplot(aes(x=mon, y=CFR)) +
geom_col(aes(fill=colorUse)) +
geom_text(aes(y=CFR/2, label=paste0(round(100*CFR, 1), "%"))) +
geom_text(data=~filter(., mon==month.abb[lubridate::month(lagMidDate)]),
aes(x=-Inf, y=Inf, label=paste0("Early Lag: ", earlyLag)),
hjust=0,
vjust=1
) +
geom_text(data=~filter(., mon==month.abb[lubridate::month(lagMidDate)]),
aes(x=Inf, y=Inf, label=paste0("Late Lag: ", lateLag)),
hjust=1,
vjust=1
) +
labs(x="",
y=paste0(stringr::str_to_title(depTitleName), " as percentage of lagged ", indepTitleName),
title=paste0(stringr::str_to_title(depTitleName),
" vs. lagged ",
indepTitleName,
" in state(s): ",
paste0(keyStates, collapse=", ")
),
subtitle=paste0("Assumed early lag on ",
lagEarlyDate,
" interpolated to late lag on ",
lagLateDate,
"\nValues above ",
maxCFR,
" reported as ",
maxCFR,
" and flagged in red"
),
caption="Linear model coefficients on lagged data with no intercept used to estimate percentage"
) +
scale_fill_identity() +
facet_wrap(~state)
print(p2)
# Return the data frame
df
}
The updated function is then run:
# Example for using the full stateData file
# Re-run of the Plains analysis after the assessStateCFR() update, with FL
# replacing AZ in the key states
plainsList_20201120 <- integrateStateData(stateData=fullStateList_20201120$stateData,
popData=fullStateList_20201120$popData,
runAll=TRUE,
keyStates=c("MN", "ND", "SD", "MT", "NE",
"KS", "IA", "MI", "NY", "FL"
),
combStates=c("ND"="Dakotas", "SD"="Dakotas"),
lagEarlyValue=NULL,
lagLateValue=NULL
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.02194667
##
## Will scale by: 0.02194667
## # A tibble: 9 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 KS 0 23
## 2 NE 0 12
## 3 MN 3 0
## 4 NY 6 2
## 5 IA 8 4
## 6 MI 10 2
## 7 Dakotas 19 20
## 8 MT 22 18
## 9 FL 23 19
# Example for using hospitalizations
# Re-run of the hospitalization-based analysis after the assessStateCFR()
# update, with FL replacing AZ in the key states
plainsListHosp_20201120 <- integrateStateData(stateData=fullStateList_20201120$stateData,
popData=fullStateList_20201120$popData,
runAll=TRUE,
var1="CTP_hosp",
keyStates=c("MN", "ND", "SD", "MT", "NE",
"IA", "MI", "NY", "FL"
),
combStates=c("ND"="Dakotas", "SD"="Dakotas"),
lagEarlyValue=NULL,
lagLateValue=NULL
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.04109452
##
## Will scale by: 0.04109452
## # A tibble: 8 x 3
## state earlyLag lateLag
## <chr> <dbl> <int>
## 1 Dakotas 0 6
## 2 MN 0 11
## 3 NY 0 0
## 4 IA 1 7
## 5 MI 3 4
## 6 MT 3 10
## 7 NE 27 12
## 8 FL 0 13
The issues appear to be addressed, with the function now running as expected.
The process can be run using the state clusters:
# Load the saved cluster assignments and relabel each state's cluster number
# as "Cluster <n>", keeping the state names attached to the vector
clustData <- readFromRDS("test_hier5_201025")$useClusters
clustStates <- names(clustData)
clustData <- setNames(paste0("Cluster ", clustData), clustStates)
# Example for using the full stateData file
# Run the analysis across all states, with combStates collapsing each
# cluster's members into a single "Cluster <n>" series
clustList_20201120 <- integrateStateData(stateData=fullStateList_20201120$stateData,
popData=fullStateList_20201120$popData,
runAll=TRUE,
keyStates=clustStates,
combStates=clustData,
lagEarlyValue=NULL,
lagLateValue=NULL
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.04979147
##
## Will scale by: 0.04979147
## # A tibble: 5 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 Cluster 3 2 23
## 2 Cluster 5 4 11
## 3 Cluster 4 6 19
## 4 Cluster 1 12 13
## 5 Cluster 2 23 22
# Example for using hospitalizations
# Same cluster-level run but using current hospitalizations (var1="CTP_hosp")
# as the independent metric
clustListHosp_20201120 <- integrateStateData(stateData=fullStateList_20201120$stateData,
popData=fullStateList_20201120$popData,
runAll=TRUE,
var1="CTP_hosp",
keyStates=clustStates,
combStates=clustData,
lagEarlyValue=NULL,
lagLateValue=NULL
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.04054712
##
## Will scale by: 0.04054712
## # A tibble: 5 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 Cluster 1 1 0
## 2 Cluster 3 1 12
## 3 Cluster 4 1 6
## 4 Cluster 2 5 14
## 5 Cluster 5 7 2
A function is written to run the component states for a given cluster:
# Function to run the process for all states in specified cluster
regionComponents <- function(lst,
                             clustVec,
                             clustNum,
                             combStates=vector("character", 0),
                             lagEarlyValue=NULL,
                             lagLateValue=NULL,
                             returnData=FALSE,
                             ...
                             ) {
  # FUNCTION ARGUMENTS:
  # lst: a processed list containing burden data (stateData) and population data (popData)
  # clustVec: a named vector containing the cluster assignment for each state
  # clustNum: cluster number(s) to use OR specific state(s) to key on (those states' clusters will be used)
  # combStates: states that should be combined together for plotting (named vector, c("state"="newName"))
  # lagEarlyValue: can force an early lag by using an integer (NULL means estimate from data)
  # lagLateValue: can force a late lag by using an integer (NULL means estimate from data)
  # returnData: whether to return the data files produced (otherwise returns NULL invisibly)
  # ...: other arguments to pass to integrateStateData()
  #
  # Get the states to run from clustVec:
  # numeric input selects the states in those clusters directly; character
  # input selects every state in the cluster(s) containing the named states
  if (is.numeric(clustNum)) {
    keyStates <- names(clustVec)[clustVec %in% clustNum]
  } else if (is.character(clustNum)) {
    clustUse <- unique(clustVec[names(clustVec) %in% clustNum])
    keyStates <- names(clustVec)[clustVec %in% clustUse]
  } else {
    stop("\nCannot determine the desired states to plot, investigate and re-run\n")
  }
  # Run for cases vs. deaths
  caseDeathList <- integrateStateData(stateData=lst$stateData,
                                      popData=lst$popData,
                                      runAll=TRUE,
                                      keyStates=keyStates,
                                      combStates=combStates,
                                      lagEarlyValue=lagEarlyValue,
                                      lagLateValue=lagLateValue,
                                      ...
                                      )
  # Run for hospitalized vs. deaths
  hospDeathList <- integrateStateData(stateData=lst$stateData,
                                      popData=lst$popData,
                                      runAll=TRUE,
                                      var1="CTP_hosp",
                                      keyStates=keyStates,
                                      combStates=combStates,
                                      lagEarlyValue=lagEarlyValue,
                                      lagLateValue=lagLateValue,
                                      ...
                                      )
  # Return data if requested; otherwise return NULL invisibly (plots and
  # printed summaries from integrateStateData() are the useful side effects)
  if (returnData) list(caseDeathList=caseDeathList, hospDeathList=hospDeathList)
  else invisible(NULL)
}
The function is then run for the NY/NJ cluster:
# Run the component states for the NY/NJ cluster: passing state names keys
# on the cluster(s) those states belong to
regionComponents(fullStateList_20201120,
clustVec=readFromRDS("test_hier5_201025")$useClusters,
clustNum=c("NY", "NJ")
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.07191544
##
## Will scale by: 0.07191544
## Warning: Removed 3 row(s) containing missing values (geom_path).
## # A tibble: 4 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 CT 5 14
## 2 MA 6 1
## 3 NY 6 2
## 4 NJ 8 25
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.04109452
##
## Will scale by: 0.04109452
## Warning: Removed 1 row(s) containing missing values (geom_path).
## # A tibble: 4 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 MA 0 0
## 2 NJ 0 11
## 3 NY 0 0
## 4 CT 3 11
The function is then run for the southern cluster:
# Run the component states for the southern cluster, keyed on FL and AZ
regionComponents(fullStateList_20201120,
clustVec=readFromRDS("test_hier5_201025")$useClusters,
clustNum=c("FL", "AZ")
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.01892897
##
## Will scale by: 0.01892897
## Warning: Removed 3 row(s) containing missing values (geom_path).
## # A tibble: 7 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 NV 8 26
## 2 AL 10 2
## 3 GA 12 20
## 4 TX 16 15
## 5 AZ 21 19
## 6 FL 23 19
## 7 SC 26 30
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.02398844
##
## Will scale by: 0.02398844
## Warning: Removed 3 row(s) containing missing values (geom_path).
## # A tibble: 7 x 3
## state earlyLag lateLag
## <chr> <dbl> <int>
## 1 AL 0 0
## 2 NV 5 15
## 3 TX 7 22
## 4 AZ 26 12
## 5 SC 26 8
## 6 GA 30 19
## 7 FL 0 13
The function is then run for the other early northern cluster:
# Run the component states for the other early northern cluster, keyed on MI
regionComponents(fullStateList_20201120,
clustVec=readFromRDS("test_hier5_201025")$useClusters,
clustNum=c("MI")
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.0187579
##
## Will scale by: 0.0187579
## Warning: Removed 3 row(s) containing missing values (geom_path).
## # A tibble: 10 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 MD 0 8
## 2 DE 1 26
## 3 IN 2 0
## 4 DC 4 25
## 5 IL 5 6
## 6 MI 10 2
## 7 LA 11 19
## 8 RI 11 4
## 9 PA 12 17
## 10 MS 15 26
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.02716523
##
## Will scale by: 0.02716523
## Warning: Removed 3 row(s) containing missing values (geom_path).
## # A tibble: 10 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 DC 0 8
## 2 DE 0 19
## 3 MD 0 3
## 4 LA 1 11
## 5 RI 1 20
## 6 MI 3 4
## 7 IN 5 6
## 8 IL 10 2
## 9 PA 10 0
## 10 MS 25 10
The function is then run for the states impacted later:
# Run the component states for the later-impacted cluster, keyed on ND
# (a large 25-state segment per the output below)
regionComponents(fullStateList_20201120,
clustVec=readFromRDS("test_hier5_201025")$useClusters,
clustNum=c("ND")
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.01257989
##
## Will scale by: 0.01257989
## Warning: Removed 3 row(s) containing missing values (geom_path).
## # A tibble: 25 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 KS 0 23
## 2 KY 0 20
## 3 NC 0 26
## 4 NE 0 12
## 5 VA 0 16
## 6 AK 3 3
## 7 MN 3 0
## 8 OR 3 14
## 9 WI 3 29
## 10 AR 6 0
## # ... with 15 more rows
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.03541542
##
## Will scale by: 0.03541542
## Warning: Removed 3 row(s) containing missing values (geom_path).
## # A tibble: 25 x 3
## state earlyLag lateLag
## <chr> <dbl> <int>
## 1 KY 0 10
## 2 MN 0 11
## 3 NC 0 10
## 4 NM 0 8
## 5 OR 0 11
## 6 VA 0 30
## 7 IA 1 7
## 8 AR 2 0
## 9 WV 2 1
## 10 CA 3 27
## # ... with 15 more rows
This segment should likely be split further, both due to size and to meaningful differences by state in disease spike in November.
The function is then run for the handful of outlier states:
# Run the component states for the outlier cluster, keyed on CO
regionComponents(fullStateList_20201120,
clustVec=readFromRDS("test_hier5_201025")$useClusters,
clustNum=c("CO")
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.008039316
##
## Will scale by: 0.008039316
## Warning: Removed 3 row(s) containing missing values (geom_path).
## # A tibble: 5 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 CO 0 23
## 2 ME 6 7
## 3 NH 11 27
## 4 VT 11 25
## 5 WA 11 16
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.02771792
##
## Will scale by: 0.02771792
## Warning: Removed 3 row(s) containing missing values (geom_path).
## # A tibble: 5 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 CO 0 11
## 2 VT 2 13
## 3 ME 4 2
## 4 NH 6 10
## 5 WA 16 4
Data are also produced to show the early outlier region and low outlier region together:
# Run the early outlier region and the low outlier region together by keying
# on one state from each cluster (CO and NY)
regionComponents(fullStateList_20201120,
clustVec=readFromRDS("test_hier5_201025")$useClusters,
clustNum=c("CO", "NY")
)
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.04185309
##
## Will scale by: 0.04185309
## Warning: Removed 3 row(s) containing missing values (geom_path).
## # A tibble: 9 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 CO 0 23
## 2 CT 5 14
## 3 MA 6 1
## 4 ME 6 7
## 5 NY 6 2
## 6 NJ 8 25
## 7 NH 11 27
## 8 VT 11 25
## 9 WA 11 16
##
## A file has been passed for stateData, components will be ignored
##
## Will scale by: 0.04109452
##
## Will scale by: 0.04109452
## Warning: Removed 1 row(s) containing missing values (geom_path).
## # A tibble: 9 x 3
## state earlyLag lateLag
## <chr> <int> <int>
## 1 CO 0 11
## 2 MA 0 0
## 3 NJ 0 11
## 4 NY 0 0
## 5 VT 2 13
## 6 CT 3 11
## 7 ME 4 2
## 8 NH 6 10
## 9 WA 16 4
New data from COVID Tracking Project are downloaded, and new state-level segments are created using similar business rules as previous:
# Use existing segments with updated data
# Download CTP data to the local CSV (skipped if the file already exists),
# compare against the Oct 25 reference run, and rebuild the hierarchical
# segments with the same business rules as before (kCut=6, VT reassigned
# to ME, minimum shape/death/case thresholds unchanged)
locDownload <- "./RInputFiles/Coronavirus/CV_downloaded_201130.csv"
test_hier5_201130 <- readRunCOVIDTrackingProject(thruLabel="Nov 29, 2020",
downloadTo=if(file.exists(locDownload)) NULL else locDownload,
readFrom=locDownload,
compareFile=readFromRDS("test_hier5_201025")$dfRaw,
hierarchical=TRUE,
reAssignState=list("VT"="ME"),
kCut=6,
minShape=3,
ratioDeathvsCase = 5,
ratioTotalvsShape = 0.25,
minDeath=100,
minCase=10000
)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## totalTestResultsSource = col_character(),
## dataQualityGrade = col_character(),
## lastUpdateEt = col_character(),
## dateModified = col_datetime(format = ""),
## checkTimeEt = col_character(),
## dateChecked = col_datetime(format = ""),
## fips = col_character(),
## hash = col_character(),
## grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
##
## File is unique by state and date
##
##
## Overall control totals in file:
## # A tibble: 1 x 3
## positiveIncrease deathIncrease hospitalizedCurrently
## <dbl> <dbl> <dbl>
## 1 13188675 257920 11071946
##
## *** COMPARISONS TO REFERENCE FILE: compareFile
##
## Checkin for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checkin for similarity of: states
## In reference but not in current:
## In current but not in reference:
##
## Checkin for similarity of: dates
## In reference but not in current:
## In current but not in reference: 2020-11-29 2020-11-28 2020-11-27 2020-11-26 2020-11-25 2020-11-24 2020-11-23 2020-11-22 2020-11-21 2020-11-20 2020-11-19 2020-11-18 2020-11-17 2020-11-16 2020-11-15 2020-11-14 2020-11-13 2020-11-12 2020-11-11 2020-11-10 2020-11-09 2020-11-08 2020-11-07 2020-11-06 2020-11-05 2020-11-04 2020-11-03 2020-11-02 2020-11-01 2020-10-31 2020-10-30 2020-10-29 2020-10-28 2020-10-27 2020-10-26 2020-10-25
##
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("date", "name")
## date name newValue oldValue
## 1 2020-03-05 positiveIncrease 86 103
## 2 2020-03-06 positiveIncrease 127 109
## 3 2020-03-07 positiveIncrease 133 176
## 4 2020-03-09 positiveIncrease 285 292
## 5 2020-03-10 positiveIncrease 439 387
## 6 2020-03-11 positiveIncrease 503 509
## 7 2020-03-12 positiveIncrease 742 671
## 8 2020-03-13 positiveIncrease 952 1072
## 9 2020-03-14 positiveIncrease 982 924
## 10 2020-03-15 positiveIncrease 1189 1291
## 11 2020-03-16 positiveIncrease 1849 1739
## 12 2020-03-17 positiveIncrease 2246 2588
## 13 2020-03-18 positiveIncrease 3364 3089
## 14 2020-03-19 positiveIncrease 4705 4651
## 15 2020-03-21 hospitalizedCurrently 1492 1436
## 16 2020-03-23 hospitalizedCurrently 2812 2770
## 17 2020-03-25 hospitalizedCurrently 5140 5062
## 18 2020-03-28 positiveIncrease 19599 19925
## 19 2020-03-28 deathIncrease 551 544
## 20 2020-03-29 deathIncrease 504 515
## 21 2020-03-30 positiveIncrease 21485 22042
## 22 2020-03-31 positiveIncrease 25174 24853
## 23 2020-03-31 deathIncrease 907 890
## 24 2020-04-01 positiveIncrease 26128 25791
## 25 2020-04-04 positiveIncrease 32867 33212
## 26 2020-04-06 positiveIncrease 28410 29002
## 27 2020-04-09 positiveIncrease 35116 34503
## 28 2020-04-10 positiveIncrease 33473 34380
## 29 2020-04-10 deathIncrease 2072 2108
## 30 2020-04-11 positiveIncrease 31092 30501
## 31 2020-04-11 deathIncrease 2079 2054
## 32 2020-04-13 positiveIncrease 24384 25195
## 33 2020-04-14 positiveIncrease 26080 25719
## 34 2020-04-15 positiveIncrease 29859 30307
## 35 2020-04-16 positiveIncrease 31581 30978
## 36 2020-04-23 deathIncrease 1814 1791
## 37 2020-04-24 deathIncrease 1972 1895
## 38 2020-04-25 deathIncrease 1627 1748
## 39 2020-04-27 deathIncrease 1287 1270
## 40 2020-04-29 deathIncrease 2685 2713
## 41 2020-05-01 deathIncrease 1808 1779
## 42 2020-05-02 deathIncrease 1531 1562
## 43 2020-05-05 deathIncrease 2496 2452
## 44 2020-05-06 deathIncrease 1915 1948
## 45 2020-05-07 positiveIncrease 27224 27537
## 46 2020-05-08 deathIncrease 1780 1798
## 47 2020-05-12 positiveIncrease 22559 22890
## 48 2020-05-12 deathIncrease 1505 1486
## 49 2020-05-13 positiveIncrease 21627 21285
## 50 2020-05-13 deathIncrease 1736 1704
## 51 2020-05-14 deathIncrease 1854 1879
## 52 2020-05-15 positiveIncrease 25422 24685
## 53 2020-05-15 deathIncrease 1265 1507
## 54 2020-05-16 positiveIncrease 23593 24702
## 55 2020-05-16 deathIncrease 1195 987
## 56 2020-05-18 deathIncrease 890 848
## 57 2020-05-21 deathIncrease 1426 1394
## 58 2020-05-22 positiveIncrease 24173 24433
## 59 2020-05-22 deathIncrease 1303 1341
## 60 2020-05-23 positiveIncrease 22365 21531
## 61 2020-05-23 deathIncrease 1035 1063
## 62 2020-05-24 positiveIncrease 18859 20072
## 63 2020-05-25 deathIncrease 553 559
## 64 2020-05-26 deathIncrease 673 645
## 65 2020-05-28 deathIncrease 1245 1231
## 66 2020-05-29 deathIncrease 1167 1184
## 67 2020-05-30 positiveIncrease 23437 23682
## 68 2020-05-30 deathIncrease 917 932
## 69 2020-06-02 deathIncrease 1000 962
## 70 2020-06-03 positiveIncrease 20155 20390
## 71 2020-06-03 deathIncrease 979 993
## 72 2020-06-04 positiveIncrease 20383 20886
## 73 2020-06-04 deathIncrease 868 893
## 74 2020-06-05 positiveIncrease 23065 23394
## 75 2020-06-05 deathIncrease 840 826
## 76 2020-06-06 positiveIncrease 22560 23064
## 77 2020-06-06 deathIncrease 710 728
## 78 2020-06-08 deathIncrease 679 661
## 79 2020-06-12 positiveIncrease 23095 23597
## 80 2020-06-12 deathIncrease 763 775
## 81 2020-06-15 deathIncrease 407 381
## 82 2020-06-16 deathIncrease 707 730
## 83 2020-06-17 deathIncrease 794 767
## 84 2020-06-18 positiveIncrease 27088 27746
## 85 2020-06-18 deathIncrease 690 705
## 86 2020-06-19 positiveIncrease 30960 31471
## 87 2020-06-20 positiveIncrease 31950 32294
## 88 2020-06-20 deathIncrease 611 629
## 89 2020-06-21 positiveIncrease 28848 27928
## 90 2020-06-22 deathIncrease 295 286
## 91 2020-06-23 positiveIncrease 33884 33447
## 92 2020-06-23 deathIncrease 725 710
## 93 2020-06-24 deathIncrease 706 724
## 94 2020-06-25 deathIncrease 664 647
## 95 2020-06-26 deathIncrease 625 637
## 96 2020-06-27 deathIncrease 502 511
## 97 2020-06-29 deathIncrease 358 332
## 98 2020-06-30 deathIncrease 585 596
## 99 2020-07-01 deathIncrease 688 701
## 100 2020-07-02 positiveIncrease 53508 54085
## 101 2020-07-04 deathIncrease 300 306
## 102 2020-07-06 positiveIncrease 41494 41959
## 103 2020-07-06 deathIncrease 266 243
## 104 2020-07-07 deathIncrease 904 923
## 105 2020-07-09 deathIncrease 900 867
## 106 2020-07-10 deathIncrease 822 854
## 107 2020-07-17 deathIncrease 935 951
## 108 2020-07-20 deathIncrease 368 363
## 109 2020-07-21 deathIncrease 1070 1039
## 110 2020-07-22 deathIncrease 1136 1171
## 111 2020-07-24 deathIncrease 1190 1176
## 112 2020-07-25 deathIncrease 1008 1023
## 113 2020-07-26 positiveIncrease 60123 61000
## 114 2020-08-01 positiveIncrease 60247 61101
## 115 2020-08-02 deathIncrease 492 498
## 116 2020-08-03 deathIncrease 536 519
## 117 2020-08-04 deathIncrease 1238 1255
## 118 2020-08-08 positiveIncrease 53084 53712
## 119 2020-08-10 deathIncrease 437 426
## 120 2020-08-14 positiveIncrease 57093 55636
## 121 2020-08-17 positiveIncrease 37411 37880
## 122 2020-08-17 deathIncrease 418 407
## 123 2020-08-21 deathIncrease 1108 1123
## 124 2020-08-22 positiveIncrease 45723 46236
## 125 2020-08-24 positiveIncrease 34250 34643
## 126 2020-08-24 deathIncrease 352 343
## 127 2020-08-29 positiveIncrease 43967 44501
## 128 2020-08-31 deathIncrease 380 366
## 129 2020-09-02 positiveIncrease 30217 30603
## 130 2020-09-07 positiveIncrease 28143 28682
## 131 2020-09-09 deathIncrease 1102 1084
## 132 2020-09-15 positiveIncrease 34904 35445
## 133 2020-09-15 deathIncrease 1044 1031
## 134 2020-09-16 deathIncrease 1184 1200
## 135 2020-09-19 positiveIncrease 44905 45564
## 136 2020-09-20 positiveIncrease 35503 36295
## 137 2020-09-22 deathIncrease 866 854
## 138 2020-09-23 deathIncrease 1147 1159
## 139 2020-09-27 positiveIncrease 34987 35454
## 140 2020-09-27 deathIncrease 302 307
## 141 2020-09-28 positiveIncrease 35883 36524
## 142 2020-09-28 deathIncrease 265 257
## 143 2020-09-29 positiveIncrease 36441 36947
## 144 2020-10-03 deathIncrease 733 741
## 145 2020-10-04 positiveIncrease 37988 38439
## 146 2020-10-05 deathIncrease 341 326
## 147 2020-10-11 positiveIncrease 46269 46946
## 148 2020-10-12 positiveIncrease 42643 43124
## 149 2020-10-13 deathIncrease 700 690
## 150 2020-10-14 positiveIncrease 56117 56797
## 151 2020-10-15 deathIncrease 937 951
## 152 2020-10-16 deathIncrease 891 877
## 153 2020-10-17 positiveIncrease 57330 57943
## 154 2020-10-17 deathIncrease 762 780
## 155 2020-10-18 positiveIncrease 48284 48922
## 156 2020-10-19 deathIncrease 461 456
## 157 2020-10-21 positiveIncrease 60953 58606
## 158 2020-10-22 positiveIncrease 72842 75248
## Joining, by = c("date", "name")
## Warning: Removed 36 row(s) containing missing values (geom_path).
##
##
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("state", "name")
## state name newValue oldValue
## 1 AK positiveIncrease 12523 13535
## 2 CO positiveIncrease 93398 91570
## 3 FL positiveIncrease 766305 776249
## 4 NM positiveIncrease 41040 40168
## 5 NM hospitalizedCurrently 27399 27120
## 6 PR positiveIncrease 31067 61275
## 7 RI positiveIncrease 30581 30116
## Rows: 15,241
## Columns: 55
## $ date <date> 2020-11-29, 2020-11-29, 2020-11-29, 20...
## $ state <chr> "AK", "AL", "AR", "AS", "AZ", "CA", "CO...
## $ positive <dbl> 30816, 247229, 156247, 0, 325995, 11989...
## $ probableCases <dbl> NA, 41286, 19157, NA, 9989, NA, 10126, ...
## $ negative <dbl> 975364, 1373770, 1538738, 1988, 1920319...
## $ pending <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestResultsSource <chr> "totalTestsViral", "totalTestsViral", "...
## $ totalTestResults <dbl> 1006180, 1579713, 1675828, 1988, 223632...
## $ hospitalizedCurrently <dbl> 159, 1609, 1030, NA, 2458, 8198, 1847, ...
## $ hospitalizedCumulative <dbl> 722, 24670, 8843, NA, 25568, NA, 13428,...
## $ inIcuCurrently <dbl> NA, NA, 390, NA, 573, 1823, NA, NA, 42,...
## $ inIcuCumulative <dbl> NA, 2234, NA, NA, NA, NA, NA, NA, NA, N...
## $ onVentilatorCurrently <dbl> 27, NA, 185, NA, 356, NA, NA, NA, 21, N...
## $ onVentilatorCumulative <dbl> NA, 1287, 975, NA, NA, NA, NA, NA, NA, ...
## $ recovered <dbl> 7165, 161946, 136872, NA, 51911, NA, 12...
## $ dataQualityGrade <chr> "A", "A", "A+", "D", "A+", "B", "A", "C...
## $ lastUpdateEt <chr> "11/29/2020 03:59", "11/29/2020 11:00",...
## $ dateModified <dttm> 2020-11-29 03:59:00, 2020-11-29 11:00:...
## $ checkTimeEt <chr> "11/28 22:59", "11/29 06:00", "11/28 19...
## $ death <dbl> 121, 3577, 2470, 0, 6634, 19121, 2521, ...
## $ hospitalized <dbl> 722, 24670, 8843, NA, 25568, NA, 13428,...
## $ dateChecked <dttm> 2020-11-29 03:59:00, 2020-11-29 11:00:...
## $ totalTestsViral <dbl> 1006180, 1579713, 1675828, 1988, NA, 23...
## $ positiveTestsViral <dbl> 38165, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ negativeTestsViral <dbl> 966992, NA, 1538738, NA, NA, NA, NA, NA...
## $ positiveCasesViral <dbl> NA, 205943, 137090, 0, 316006, 1198934,...
## $ deathConfirmed <dbl> 121, 3245, 2265, NA, 6148, NA, NA, 3981...
## $ deathProbable <dbl> NA, 332, 205, NA, 486, NA, NA, 980, NA,...
## $ totalTestEncountersViral <dbl> NA, NA, NA, NA, NA, NA, 3175126, NA, 69...
## $ totalTestsPeopleViral <dbl> NA, NA, NA, NA, 2236325, NA, 1737952, N...
## $ totalTestsAntibody <dbl> NA, NA, NA, NA, 363824, NA, 206288, NA,...
## $ positiveTestsAntibody <dbl> NA, NA, NA, NA, NA, NA, 16879, NA, NA, ...
## $ negativeTestsAntibody <dbl> NA, NA, NA, NA, NA, NA, 188172, NA, NA,...
## $ totalTestsPeopleAntibody <dbl> NA, 71698, NA, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ negativeTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestsPeopleAntigen <dbl> NA, NA, 135709, NA, NA, NA, NA, NA, NA,...
## $ positiveTestsPeopleAntigen <dbl> NA, NA, 23208, NA, NA, NA, NA, NA, NA, ...
## $ totalTestsAntigen <dbl> NA, NA, 21856, NA, NA, NA, NA, 49816, N...
## $ positiveTestsAntigen <dbl> NA, NA, 3300, NA, NA, NA, NA, NA, NA, N...
## $ fips <chr> "02", "01", "05", "60", "04", "06", "08...
## $ positiveIncrease <dbl> 612, 2236, 1221, 0, 3221, 15614, 3489, ...
## $ negativeIncrease <dbl> 6514, 3978, 9224, 0, 15300, 218705, 904...
## $ total <dbl> 1006180, 1620999, 1694985, 1988, 224631...
## $ totalTestResultsIncrease <dbl> 7126, 5811, 10243, 0, 18441, 234319, 31...
## $ posNeg <dbl> 1006180, 1620999, 1694985, 1988, 224631...
## $ deathIncrease <dbl> 0, 5, 21, 0, 10, 32, 0, 0, 2, 7, 59, 18...
## $ hospitalizedIncrease <dbl> 1, 0, 24, 0, 220, 0, 59, 0, 0, 0, 116, ...
## $ hash <chr> "81a1922227c01f54d1d8cc7e718af55ee8b680...
## $ commercialScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeRegularScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ positiveScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ score <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ grade <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
##
##
## Control totals - note that validState other than TRUE will be discarded
##
## # A tibble: 2 x 6
## validState cases deaths hosp tests n
## <lgl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 FALSE 60159 1231 NA 488050 1295
## 2 TRUE 13128516 256689 NA 190653499 13946
## Rows: 13,946
## Columns: 6
## $ date <date> 2020-11-29, 2020-11-29, 2020-11-29, 2020-11-29, 2020-11-29,...
## $ state <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", ...
## $ cases <dbl> 612, 2236, 1221, 3221, 15614, 3489, 0, 140, 581, 7131, 1665,...
## $ deaths <dbl> 0, 5, 21, 10, 32, 0, 0, 2, 7, 59, 18, 4, 7, 4, 44, 22, 0, 11...
## $ hosp <dbl> 159, 1609, 1030, 2458, 8198, 1847, 1017, 145, 211, 4059, 249...
## $ tests <dbl> 7126, 5811, 10243, 18441, 234319, 31899, 0, 5004, 9274, 8509...
## Rows: 13,946
## Columns: 14
## $ date <date> 2020-01-22, 2020-01-22, 2020-01-23, 2020-01-23, 2020-01-24,...
## $ state <chr> "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", ...
## $ cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hosp <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tests <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ cpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ dpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm <dbl> 0.0000000, 0.0000000, 0.1471796, 0.0000000, 0.0000000, 0.000...
## $ cpm7 <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ dpm7 <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ hpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm7 <dbl> NA, NA, NA, NA, NA, NA, 0.04205130, 0.00000000, 0.06307695, ...
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
##
## Recency is defined as 2020-10-31 through current
##
## Recency is defined as 2020-10-31 through current
## `summarise()` regrouping output by 'state', 'cluster', 'date' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Cache the refreshed state-level segmentation run to RDS. ovrWriteError=FALSE
# presumably downgrades "file already exists" from an error to a skip, as the
# console output for the later cdcList save suggests - TODO confirm in the
# shared functions file
saveToRDS(test_hier5_201130, ovrWriteError=FALSE)
The state-level segments are compared for general overlap:
# Align the new and old state->segment assignments side by side. The names of
# each cluster vector are kept (state / ckState) so row alignment can be checked.
newClusterVec <- test_hier5_201130$useClusters
oldClusterVec <- test_old_201108$useClusters
stateSegmentChange <- tibble::tibble(state=names(newClusterVec),
                                     newSegment=unname(newClusterVec),
                                     oldSegment=unname(oldClusterVec),
                                     ckState=names(oldClusterVec)
                                     )
# Sanity check: zero means both vectors list the states in the same order
stateSegmentChange %>%
  summarize(wrongState=sum(state != ckState))
## # A tibble: 1 x 1
## wrongState
## <int>
## 1 0
# Heat-map of how states moved between the old and new segmentations;
# axes are ordered by total state count per segment
segmentOverlapCounts <- count(stateSegmentChange, oldSegment, newSegment)
ggplot(segmentOverlapCounts,
       aes(x=fct_reorder(factor(oldSegment), n, .fun=sum),
           y=fct_reorder(factor(newSegment), n, .fun=sum))) +
  geom_tile(aes(fill=n)) +
  geom_text(aes(label=n)) +
  coord_flip() +
  scale_fill_continuous("# States", low="white", high="green", limits=c(0, NA)) +
  labs(title="State segment movement", x="Original Segment", y="New Segment")
Because the segments can now incorporate the October-November spike, there is significant splitting of segments according to the degree to which they are affected by the more recent increases.
Further, plots are made for the disease evolution by state for each of the segment overlaps:
# Build per-state plotting data for the 7-day rolling per-million series
# (vpm7), tagging each state with its old and new segment assignment.
stateSegChangePlotData <- test_hier5_201130$consolidatedPlotData %>%
ungroup() %>%
select(state, date, name, pop, vpm7) %>%
filter(!is.na(vpm7)) %>%
inner_join(select(stateSegmentChange, state, newSegment, oldSegment), by=c("state")) %>%
# The magrittr dot re-pipes the joined data: append a copy of every row with
# state recoded to "TOTAL" so an aggregate series can be formed per facet
bind_rows(mutate(., state="TOTAL")) %>%
group_by(state, date, name, newSegment, oldSegment) %>%
# Population-weighted mean of vpm7: a no-op for individual states, but for the
# "TOTAL" rows it aggregates across states within each segment overlap
summarize(vpm7=sum(vpm7*pop)/sum(pop), pop=sum(pop), .groups="drop")
# Factor-level orderings by row frequency: new segments ascending,
# old segments descending
newSegLevels <- stateSegChangePlotData %>%
  count(newSegment) %>%
  arrange(n) %>%
  pull(newSegment)
oldSegLevels <- stateSegChangePlotData %>%
  count(oldSegment) %>%
  arrange(desc(n)) %>%
  pull(oldSegment)
# One faceted plot per metric: grey lines are individual states, the colored
# line is the weighted "TOTAL" series within each old/new segment facet
for (keyVar in c("deaths", "cases", "hosp")) {
  segPlot <- stateSegChangePlotData %>%
    filter(name==keyVar) %>%
    mutate(newSegment=factor(newSegment, levels=newSegLevels),
           oldSegment=factor(oldSegment, levels=oldSegLevels)) %>%
    ggplot(aes(x=date, y=vpm7)) +
    geom_line(data=~filter(., state != "TOTAL"), aes(group=state), color="grey") +
    geom_line(data=~filter(., state == "TOTAL"), aes(group=1, color=newSegment)) +
    facet_grid(oldSegment ~ newSegment) +
    scale_x_date(date_breaks="1 months", date_labels="%b") +
    scale_color_discrete("New\nSegment") +
    theme(axis.text.x = element_text(angle = 90)) +
    labs(x="",
         y="Per million (7-day rolling mean)",
         title=paste0(stringr::str_to_title(keyVar), " per million per day by segment overlap"))
  print(segPlot)
}
New data from USA Facts are downloaded, and new county-level segments are created using business rules similar to those used previously:
# Locations for the population, case, and death files (USA Facts downloads)
usafDir <- "./RInputFiles/Coronavirus"
popLoc <- file.path(usafDir, "covid_county_population_usafacts.csv")
caseLoc <- file.path(usafDir, "covid_confirmed_usafacts_downloaded_20201203.csv")
deathLoc <- file.path(usafDir, "covid_deaths_usafacts_downloaded_20201203.csv")
# Run old segments against new data:
# re-run the USA Facts county pipeline against data through 2020-12-02,
# re-using the refreshed state clusters and re-creating county clusters.
# NOTE(review): per-argument meanings are inferred from names/values;
# confirm against Coronavirus_Statistics_Functions_USAF_v003.R
cty_new_20201203 <- readRunUSAFacts(maxDate="2020-12-02",
                                    popLoc=popLoc,
                                    caseLoc=caseLoc,
                                    deathLoc=deathLoc,
                                    # Download only when either local file is missing;
                                    # all(file.exists(...)) expresses the scalar test directly
                                    dlCaseDeath=!all(file.exists(c(caseLoc, deathLoc))),
                                    oldFile=readFromRDS("cty_20201026")$dfBurden,
                                    existingStateClusters=test_hier5_201130$useClusters,
                                    createClusters=TRUE,
                                    hierarchical=NA,
                                    minShape=4,
                                    maxShape=11,
                                    ratioDeathvsCase=5,
                                    ratioTotalvsShape=0.25,
                                    minDeath=100,   # floor of 100 deaths/million (echoed in output below)
                                    minCase=5000,   # floor of 5000 cases/million (echoed in output below)
                                    hmlSegs=3,
                                    eslSegs=3,
                                    seed=2012040236,
                                    orderCluster="dpm"
                                    )
##
## -- Column specification --------------------------------------------------------
## cols(
## countyFIPS = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## population = col_double()
## )
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Rows: 1,009,620
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumCases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Rows: 1,009,620
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumDeaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## `geom_smooth()` using formula 'y ~ x'
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
##
## Shape curves will impose a floor of at least 5000 cases per million
## Shape curves will impose a floor of at least 100 deaths per million
## *** Counties with 0 cases/deaths or that fall below the floor for minCase/minDeath ***
## # A tibble: 1 x 4
## cpm_mean_is0 dpm_mean_is0 dpm_mean_ltDeath cpm_mean_ltCase
## <dbl> <dbl> <dbl> <dbl>
## 1 0 0.00377 0.0421 0.00566
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
##
## Recency is defined as 2020-11-03 through current
##
## Recency is defined as 2020-11-03 through current
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## Joining, by = "fipsCounty"
## Joining, by = "fipsCounty"
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Cache the refreshed county-level run to RDS; ovrWriteError=FALSE presumably
# makes an existing file a skip rather than an error - TODO confirm in the
# shared functions file
saveToRDS(cty_new_20201203, ovrWriteError=FALSE)
The overlap of county segments is calculated and plotted:
# Lookup of each county's segment from the previous (old) run.
# NOTE(review): the helperACC_county column is named "state" but appears to
# hold county FIPS codes (it is zero-padded to 5 characters below) - confirm
oldCountySegment <- cty_old_20201120$helperACC_county %>%
  distinct(state, oldSegment=cluster)
# Pair each county's new segment with its old one, keeping counties present
# in either run, then left-pad the identifier to a 5-character code
countySegmentChange <- cty_new_20201203$helperACC_county %>%
  distinct(state, newSegment=cluster) %>%
  full_join(oldCountySegment, by="state") %>%
  mutate(state=stringr::str_pad(state, width=5, side="left", pad="0"))
# Count missing values per column; all zeros confirms the new/old county
# tables joined with no unmatched counties. (Original used a pipeline with a
# missing space after `%>%`; a direct base-R call is clearer.)
colSums(is.na(countySegmentChange))
## state newSegment oldSegment
## 0 0 0
# Heat-map of how counties moved between the old and new segmentations;
# axes are ordered by total county count per segment
countyOverlapCounts <- count(countySegmentChange, oldSegment, newSegment)
ggplot(countyOverlapCounts,
       aes(x=fct_reorder(factor(oldSegment), n, .fun=sum),
           y=fct_reorder(factor(newSegment), n, .fun=sum))) +
  geom_tile(aes(fill=n)) +
  geom_text(aes(label=n)) +
  coord_flip() +
  scale_fill_continuous("# Counties", low="white", high="green", limits=c(0, NA)) +
  labs(title="County segment movement", x="Original Segment", y="New Segment")
# Test association between old and new segment assignments.
# NOTE(review): simulate.p.value=TRUE uses Monte Carlo simulation, so the
# p-value is stochastic and no seed is set here - reruns may differ slightly
fisher.test(select(countySegmentChange, oldSegment, newSegment) %>% table(), simulate.p.value=TRUE)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 2000 replicates)
##
## data: select(countySegmentChange, oldSegment, newSegment) %>% table()
## p-value = 0.0004998
## alternative hypothesis: two.sided
# Chi-squared test on the same contingency table; the emitted warning flags
# sparse cells, which is why the simulated Fisher test is also reported
chisq.test(select(countySegmentChange, oldSegment, newSegment) %>% table())
## Warning in chisq.test(select(countySegmentChange, oldSegment, newSegment) %>% :
## Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: select(countySegmentChange, oldSegment, newSegment) %>% table()
## X-squared = 1707.8, df = 64, p-value < 2.2e-16
There is meaningful overlap between old county segment and new county segment. Plots of disease burden by county segment overlap are also created:
# County-level analogue of the state plot data. NOTE(review): in
# countySegmentChange the column named "state" appears to hold the zero-padded
# 5-digit county FIPS (it was str_pad-ed above), hence the fipsCounty=state
# rename in the join - confirm
countySegChangePlotData <- cty_new_20201203$clusterStateData %>%
ungroup() %>%
select(fipsCounty, date, countyName, state, pop, cpm7, dpm7) %>%
filter(!is.na(cpm7), !is.na(dpm7)) %>%
inner_join(select(countySegmentChange, fipsCounty=state, newSegment, oldSegment), by=c("fipsCounty")) %>%
# Magrittr dot: append a "TOTAL" copy of every row so a population-weighted
# aggregate series can be computed per segment overlap
bind_rows(mutate(., fipsCounty="TOTAL", state="TOTAL", countyName="TOTAL")) %>%
group_by(fipsCounty, date, countyName, state, newSegment, oldSegment) %>%
# Population-weighted means: a no-op per county, an aggregate for TOTAL rows
summarize(cpm7=sum(cpm7*pop)/sum(pop), dpm7=sum(dpm7*pop)/sum(pop), pop=sum(pop), .groups="drop")
# Factor-level orderings by row frequency: new segments ascending,
# old segments descending
newSegLevels <- countySegChangePlotData %>%
  count(newSegment) %>%
  arrange(n) %>%
  pull(newSegment)
oldSegLevels <- countySegChangePlotData %>%
  count(oldSegment) %>%
  arrange(desc(n)) %>%
  pull(oldSegment)
# One faceted plot per metric (cases/deaths per million, 7-day rolling mean).
# Fix: use the tidy-eval .data pronoun instead of get(keyVar). get() resolves
# the name in the calling environment and can silently pick up an unrelated
# object, while .data[[keyVar]] is scoped to the plot data.
for (keyVar in c("cpm7", "dpm7")) {
  p1 <- countySegChangePlotData %>%
    mutate(newSegment=factor(newSegment, levels=newSegLevels),
           oldSegment=factor(oldSegment, levels=oldSegLevels)
    ) %>%
    ggplot(aes(x=date, y=.data[[keyVar]])) +
    # Per-county grey lines left disabled (too many counties to render cleanly)
    # geom_line(data=~filter(., state != "TOTAL"), aes(group=fipsCounty), color="grey") +
    geom_line(data=~filter(., state == "TOTAL"), aes(group=1, color=newSegment)) +
    facet_grid(oldSegment ~ newSegment) +
    scale_x_date(date_breaks="1 months", date_labels="%b") +
    scale_color_discrete("New\nSegment") +
    theme(axis.text.x = element_text(angle = 90)) +
    labs(x="",
         y="Per million (7-day rolling mean)",
         title=paste0(stringr::str_to_title(keyVar), " per million per day by segment overlap")
    )
  print(p1)
}
Further, updated CDC data are downloaded:
# Download new data
# Raw CDC weekly all-cause deaths extract (by jurisdiction and age group),
# downloaded 2020-12-06
cdcLoc <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20201206.csv"
# Read the CDC all-cause mortality data and run the excess-death analysis
# against the refreshed state segmentation (test_hier5_201130).
# NOTE(review): per-argument meanings below are inferred from names/values;
# confirm against Coronavirus_Statistics_Functions_CDC_v003.R
cdcList_20201206 <- readRunCDCAllCause(loc=cdcLoc,
startYear=2015,  # first year of the 2015-2019 baseline period
curYear=2020,  # year under analysis
weekThru=39,  # last 2020 week used (week 39 ends 2020-09-26 per output below)
startWeek=9,
lst=test_hier5_201130,
epiMap=readFromRDS("epiMonth"),
agePopData=readFromRDS("usPopBucket2020"),
cvDeathThru="2020-09-26",  # COVID-death cutoff aligned with CDC week 39
cdcPlotStartWeek=10,
dlData=!file.exists(paste0("./RInputFiles/Coronavirus/", cdcLoc)),  # download only if not cached
stateNoCheck=c("NC")  # NC has suppressed weeks (see suppression checks in output)
)
## Rows: 180,816
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date` <chr> "01/10/2015", "01/17/2015", "01/24/2015", "01/...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group` <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths` <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period` <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 180,816
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
## age n n_deaths_na deaths
## <chr> <int> <int> <dbl>
## 1 25-44 years 27094 7 3318967
## 2 45-64 years 33072 13 12975960
## 3 65-74 years 33058 14 12893879
## 4 75-84 years 33078 15 16026289
## 5 85 years and older 33066 16 20874940
## 6 Under 25 years 21448 0 1425191
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups: period, year [6]
## period year type n n_deaths_na deaths
## <chr> <int> <chr> <int> <int> <dbl>
## 1 2015-2019 2015 Predicted (weighted) 15285 0 5416393
## 2 2015-2019 2015 Unweighted 15285 0 5416393
## 3 2015-2019 2016 Predicted (weighted) 15366 0 5483774
## 4 2015-2019 2016 Unweighted 15366 0 5483774
## 5 2015-2019 2017 Predicted (weighted) 15317 0 5643340
## 6 2015-2019 2017 Unweighted 15317 0 5643340
## 7 2015-2019 2018 Predicted (weighted) 15307 0 5698023
## 8 2015-2019 2018 Unweighted 15307 0 5698023
## 9 2015-2019 2019 Predicted (weighted) 15317 0 5725524
## 10 2015-2019 2019 Unweighted 15317 0 5725524
## 11 2020 2020 Predicted (weighted) 13840 38 5840770
## 12 2020 2020 Unweighted 13792 27 5740348
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups: period [2]
## period Suppress n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-2019 <NA> 153184 0 5.59e7
## 2 2020 Suppressed (counts highly incomplete, <5~ 65 65 0.
## 3 2020 <NA> 27567 0 1.16e7
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 9 x 5
## # Groups: period [2]
## period Note n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-20~ <NA> 153184 0 5.59e7
## 2 2020 Data in recent weeks are incomplete. Only~ 22570 8 9.90e6
## 3 2020 Data in recent weeks are incomplete. Only~ 492 0 2.29e5
## 4 2020 Data in recent weeks are incomplete. Only~ 259 0 3.00e4
## 5 2020 Data in recent weeks are incomplete. Only~ 1925 57 4.47e5
## 6 2020 Data in recent weeks are incomplete. Only~ 24 0 1.31e4
## 7 2020 Estimates for Pennsylvania are too low fo~ 48 0 2.26e4
## 8 2020 Weights may be too low to account for und~ 168 0 4.54e4
## 9 2020 <NA> 2146 0 8.93e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## state Jurisdiction n n_deaths_na deaths
## 1 US United States 3684 0 33636341
## 2 CA California 3684 0 3198495
## 3 FL Florida 3684 0 2456535
## 4 TX Texas 3684 0 2424271
## 5 PA Pennsylvania 3684 0 1612787
## 6 OH Ohio 3684 0 1455156
## 7 IL Illinois 3684 0 1272621
## 8 NY New York 3684 0 1200005
## 9 MI Michigan 3684 0 1157271
## 10 NC North Carolina 3585 33 1072354
## 11 GA Georgia 3683 0 1009519
## 12 NJ New Jersey 3678 0 898785
## 13 TN Tennessee 3684 0 880363
## 14 VA Virginia 3684 0 808002
## 15 IN Indiana 3681 0 782822
## 16 MO Missouri 3682 0 762494
## 17 AZ Arizona 3684 0 712935
## 18 MA Massachusetts 3646 0 710364
## 19 YC New York City 3680 0 693824
## 20 WA Washington 3684 0 671307
## 21 AL Alabama 3683 0 624906
## 22 WI Wisconsin 3664 0 621043
## 23 MD Maryland 3678 0 594189
## 24 SC South Carolina 3680 0 586190
## 25 KY Kentucky 3642 0 569155
## 26 LA Louisiana 3680 0 549152
## 27 MN Minnesota 3637 0 526472
## 28 CO Colorado 3682 0 466081
## 29 OK Oklahoma 3672 0 465298
## 30 OR Oregon 3512 0 430304
## 31 MS Mississippi 3620 0 379957
## 32 AR Arkansas 3576 0 378866
## 33 CT Connecticut 3236 19 370453
## 34 IA Iowa 3313 0 355740
## 35 PR Puerto Rico 3392 0 344999
## 36 KS Kansas 3370 0 310278
## 37 NV Nevada 3415 0 301998
## 38 WV West Virginia 3102 13 258872
## 39 UT Utah 3571 0 223643
## 40 NM New Mexico 3251 0 213726
## 41 NE Nebraska 2960 0 197462
## 42 ME Maine 2750 0 166838
## 43 ID Idaho 2877 0 160328
## 44 NH New Hampshire 2772 0 140545
## 45 HI Hawaii 2658 0 129455
## 46 RI Rhode Island 2570 0 118811
## 47 MT Montana 2656 0 115410
## 48 DE Delaware 2662 0 103627
## 49 SD South Dakota 2544 0 91220
## 50 ND North Dakota 2526 0 78972
## 51 DC District of Columbia 2646 0 66641
## 52 VT Vermont 2428 0 64407
## 53 WY Wyoming 2411 0 49654
## 54 AK Alaska 2453 0 44283
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## Rows: 180,816
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 88,160
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
##
## *** Data suppression checks ***
## # A tibble: 14 x 11
## Jurisdiction weekEnding state year week age deaths period type Suppress
## <chr> <date> <chr> <fct> <int> <fct> <dbl> <fct> <chr> <chr>
## 1 North Carol~ 2020-09-12 NC 2020 37 25-4~ NA 2020 Pred~ Suppres~
## 2 North Carol~ 2020-09-26 NC 2020 39 25-4~ NA 2020 Pred~ Suppres~
## 3 North Carol~ 2020-09-12 NC 2020 37 45-6~ NA 2020 Pred~ Suppres~
## 4 North Carol~ 2020-09-19 NC 2020 38 45-6~ NA 2020 Pred~ Suppres~
## 5 North Carol~ 2020-09-26 NC 2020 39 45-6~ NA 2020 Pred~ Suppres~
## 6 North Carol~ 2020-09-12 NC 2020 37 65-7~ NA 2020 Pred~ Suppres~
## 7 North Carol~ 2020-09-19 NC 2020 38 65-7~ NA 2020 Pred~ Suppres~
## 8 North Carol~ 2020-09-26 NC 2020 39 65-7~ NA 2020 Pred~ Suppres~
## 9 North Carol~ 2020-09-12 NC 2020 37 75-8~ NA 2020 Pred~ Suppres~
## 10 North Carol~ 2020-09-19 NC 2020 38 75-8~ NA 2020 Pred~ Suppres~
## 11 North Carol~ 2020-09-26 NC 2020 39 75-8~ NA 2020 Pred~ Suppres~
## 12 North Carol~ 2020-09-12 NC 2020 37 85 y~ NA 2020 Pred~ Suppres~
## 13 North Carol~ 2020-09-19 NC 2020 38 85 y~ NA 2020 Pred~ Suppres~
## 14 North Carol~ 2020-09-26 NC 2020 39 85 y~ NA 2020 Pred~ Suppres~
## # ... with 1 more variable: Note <chr>
##
## *** Data suppression checks failed - total of 14 suppressions
## *** Of these suppressions, 9 are NOT from weekThru of current year
## Continuing since all states with problems are in stateNoCheck
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)
## Rows: 82,918
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## First duplicate is in row number (0 means no duplicates): 0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)
## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)
## `summarise()` regrouping output by 'year', 'age', 'week' (override with `.groups` argument)
##
## Plots will be run after excluding stateNoCheck states
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'state', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "state"
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'age', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'age' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Cache the processed CDC list to disk for later sessions.
# ovrWriteError=FALSE: keep any existing file rather than erroring
# (the echoed message below confirms the existing RDS is not replaced).
# NOTE(review): saveToRDS is defined in a sourced functions file and appears
# to derive the file path from the object name -- confirm against
# Coronavirus_Statistics_Functions_Shared_v003.R.
saveToRDS(cdcList_20201206, ovrWriteError=FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/cdcList_20201206.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
A function is written to compare CDC deaths for the same time period in two different files:
# Compare total CDC-reported weekly deaths between two raw CDC extracts.
# Reads/aggregates each source, joins them by week-ending date, prints two
# diagnostic plots (total deaths over time; per-week deltas above a threshold),
# and returns the merged comparison (optionally with both inputs).
cdcDeathCompare <- function(loc1,
                            loc2,
                            dir="./RInputFiles/Coronavirus/",
                            periodKeep="2015-2019",
                            weekThru=53,
                            stateNoCheck=c(state.abb, "DC"),
                            threshPct=0.0005,
                            returnList=FALSE
                            ) {
  # FUNCTION ARGUMENTS:
  # loc1: the first raw CDC file (either character location of CSV or data frame)
  # loc2: the second raw CDC file (either character location of CSV or data frame)
  # dir: the directory containing the raw CDC files
  # periodKeep: keep all data from this time period
  # weekThru: keep all data that is from this week or earlier, even if not in periodKeep
  # stateNoCheck: do not run error checks for these states (all in this case)
  # threshPct: threshold (as a fraction of the larger weekly count) for plotting a delta
  # returnList: boolean, whether to return a list including both CDC files and the difference file
  #   if FALSE (default), only the difference file (a tibble) is returned
  # RETURNS: tibble with columns weekEnding, deaths1, deaths2; or, if
  #   returnList=TRUE, list(cdc=<merged tibble>, cdc1=<first>, cdc2=<second>)

  # Helper: read a raw CDC file and aggregate to total deaths per weekEnding
  readRawCDC <- function(x) {
    readProcessCDC(x, weekThru=weekThru, periodKeep=periodKeep, fDir=dir, stateNoCheck=stateNoCheck) %>%
      group_by(weekEnding) %>%
      summarize(deaths=sum(deaths, na.rm=TRUE), .groups="drop")
  }
  # Helper: accept either a character file location (read it) or a
  # pre-aggregated data frame (use as-is)
  resolveCDC <- function(loc) {
    if (is.character(loc)) readRawCDC(loc) else loc
  }
  # Helper: derive a YYYYMMDD display label for a source -- from the file name
  # when given a path, otherwise from the latest weekEnding in the data
  labelCDC <- function(loc) {
    if (is.character(loc)) {
      stringr::str_extract(loc, pattern="\\d{8}")
    } else {
      loc %>%
        pull(weekEnding) %>%
        max() %>%
        as.character() %>%
        stringr::str_replace_all("-", "")
    }
  }

  cdc1 <- resolveCDC(loc1)
  cdc2 <- resolveCDC(loc2)

  # Merge the two weekly series on weekEnding (full join keeps weeks present
  # in only one of the files, with NA for the missing side)
  cdc <- select(cdc1, weekEnding, deaths1=deaths) %>%
    full_join(select(cdc2, weekEnding, deaths2=deaths), by="weekEnding")

  # Mapping from merged column names to human-readable period labels
  name1 <- labelCDC(loc1)
  name2 <- labelCDC(loc2)
  mapFile <- c(deaths1=name1, deaths2=name2)

  # Plot 1: both total-deaths series over time
  p1 <- cdc %>%
    pivot_longer(-weekEnding) %>%
    mutate(name=mapFile[name]) %>%
    ggplot(aes(x=weekEnding, y=value)) +
    geom_line(aes(group=name, color=name)) +
    labs(x="",
         y="Total deaths",
         title="Change in total deaths (predicted) reported in CDC data by time period"
         ) +
    scale_color_discrete("CDC Time Period") +
    ylim(0, NA)
  print(p1)

  # Plot 2: weeks where the files differ by at least threshPct of the larger count
  p2 <- cdc %>%
    filter(!is.na(deaths1), !is.na(deaths2)) %>%
    mutate(delta=deaths2-deaths1) %>%
    filter(abs(delta) >= threshPct*pmax(deaths1, deaths2)) %>%
    ggplot(aes(x=weekEnding, y=delta)) +
    geom_col(aes(fill=delta>0)) +
    geom_text(aes(y=delta+100, label=delta), hjust=0) +  # fixed +100 offset keeps labels off the bars
    labs(x="",
         y="Delta between files (positive means more in second file)",
         title="Change in CDC reported deaths by week",
         subtitle=paste0("Includes only weeks with change at least ", round(threshPct*100, 3), "%\n",
                         "Dates: ", name1," and ", name2
                         )
         ) +
    coord_flip() +
    scale_fill_discrete("Positive Delta")
  print(p2)

  # Return the merged data (plus the inputs when returnList=TRUE)
  if (returnList) list(cdc=cdc, cdc1=cdc1, cdc2=cdc2)
  else cdc
}
# Raw CDC downloads from 2020-09-23 and 2020-12-06 to be compared
loc1 <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20200923.csv"
loc2 <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20201206.csv"
# returnList=TRUE keeps both per-file aggregates alongside the merged deltas
cdcCompare_0923_1206 <- cdcDeathCompare(loc1, loc2, returnList=TRUE)
## Rows: 174,311
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date` <chr> "1/10/2015", "1/17/2015", "1/24/2015", "1/31/2...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group` <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths` <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period` <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 174,311
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
## age n n_deaths_na deaths
## <chr> <int> <int> <dbl>
## 1 25-44 years 26066 3 3170502
## 2 45-64 years 31884 7 12464677
## 3 65-74 years 31880 10 12347549
## 4 75-84 years 31902 11 15363924
## 5 85 years and older 31888 9 20067928
## 6 Under 25 years 20691 0 1372225
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups: period, year [6]
## period year type n n_deaths_na deaths
## <chr> <int> <chr> <int> <int> <dbl>
## 1 2015-2019 2015 Predicted (weighted) 15285 0 5416393
## 2 2015-2019 2015 Unweighted 15285 0 5416393
## 3 2015-2019 2016 Predicted (weighted) 15365 0 5483764
## 4 2015-2019 2016 Unweighted 15365 0 5483764
## 5 2015-2019 2017 Predicted (weighted) 15318 0 5643350
## 6 2015-2019 2017 Unweighted 15318 0 5643350
## 7 2015-2019 2018 Predicted (weighted) 15305 0 5698002
## 8 2015-2019 2018 Unweighted 15305 0 5698002
## 9 2015-2019 2019 Predicted (weighted) 15319 0 5725516
## 10 2015-2019 2019 Unweighted 15319 0 5725516
## 11 2020 2020 Predicted (weighted) 10586 24 4476340
## 12 2020 2020 Unweighted 10541 16 4376415
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups: period [2]
## period Suppress n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-2019 <NA> 153184 0 5.59e7
## 2 2020 Suppressed (counts highly incomplete, <5~ 40 40 0.
## 3 2020 <NA> 21087 0 8.85e6
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 9 x 5
## # Groups: period [2]
## period Note n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-20~ <NA> 153184 0 5.59e7
## 2 2020 Data in recent weeks are incomplete. Only~ 16591 0 7.27e6
## 3 2020 Data in recent weeks are incomplete. Only~ 324 0 1.52e5
## 4 2020 Data in recent weeks are incomplete. Only~ 288 30 3.05e4
## 5 2020 Data in recent weeks are incomplete. Only~ 1502 10 4.17e5
## 6 2020 Data in recent weeks are incomplete. Only~ 60 0 2.71e4
## 7 2020 Estimates for Pennsylvania are too low fo~ 48 0 2.23e4
## 8 2020 Weights may be too low to account for und~ 436 0 1.40e5
## 9 2020 <NA> 1878 0 7.97e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## state Jurisdiction n n_deaths_na deaths
## 1 US United States 3552 0 32276762
## 2 CA California 3552 0 3072016
## 3 FL Florida 3552 0 2357528
## 4 TX Texas 3552 0 2314502
## 5 PA Pennsylvania 3552 0 1548716
## 6 OH Ohio 3552 0 1395014
## 7 IL Illinois 3552 0 1219119
## 8 NY New York 3552 0 1155790
## 9 MI Michigan 3552 0 1111203
## 10 NC North Carolina 3521 17 1051632
## 11 GA Georgia 3551 0 965504
## 12 NJ New Jersey 3546 0 867210
## 13 TN Tennessee 3552 0 840787
## 14 VA Virginia 3552 0 774383
## 15 IN Indiana 3550 0 749760
## 16 MO Missouri 3548 0 728220
## 17 MA Massachusetts 3516 0 685409
## 18 AZ Arizona 3552 0 684537
## 19 YC New York City 3548 0 671106
## 20 WA Washington 3551 0 645406
## 21 AL Alabama 3550 0 598526
## 22 WI Wisconsin 3533 0 592047
## 23 MD Maryland 3546 0 570238
## 24 SC South Carolina 3549 0 560415
## 25 KY Kentucky 3519 0 545032
## 26 LA Louisiana 3545 0 525668
## 27 MN Minnesota 3509 0 503567
## 28 CO Colorado 3550 0 446708
## 29 OK Oklahoma 3541 0 445362
## 30 OR Oregon 3382 0 413553
## 31 MS Mississippi 3488 0 363792
## 32 AR Arkansas 3444 0 361612
## 33 CT Connecticut 3106 13 356416
## 34 IA Iowa 3190 0 339791
## 35 PR Puerto Rico 3272 0 331654
## 36 KS Kansas 3246 0 296520
## 37 NV Nevada 3291 0 289275
## 38 WV West Virginia 3011 10 251046
## 39 UT Utah 3438 0 213931
## 40 NM New Mexico 3140 0 205026
## 41 NE Nebraska 2846 0 188492
## 42 ME Maine 2646 0 160341
## 43 ID Idaho 2766 0 152936
## 44 NH New Hampshire 2672 0 135110
## 45 HI Hawaii 2556 0 124379
## 46 RI Rhode Island 2474 0 114274
## 47 MT Montana 2556 0 109719
## 48 DE Delaware 2558 0 99625
## 49 SD South Dakota 2448 0 86461
## 50 ND North Dakota 2433 0 75079
## 51 DC District of Columbia 2545 0 64202
## 52 VT Vermont 2336 0 61925
## 53 WY Wyoming 2318 0 47230
## 54 AK Alaska 2352 0 42249
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## Rows: 174,311
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 87,178
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
##
## *** Data suppression checks ***
## # A tibble: 24 x 11
## Jurisdiction weekEnding state year week age deaths period type Suppress
## <chr> <date> <chr> <fct> <int> <fct> <dbl> <fct> <chr> <chr>
## 1 Connecticut 2020-08-15 CT 2020 33 45-6~ NA 2020 Pred~ Suppres~
## 2 Connecticut 2020-08-22 CT 2020 34 45-6~ NA 2020 Pred~ Suppres~
## 3 Connecticut 2020-08-15 CT 2020 33 65-7~ NA 2020 Pred~ Suppres~
## 4 Connecticut 2020-08-22 CT 2020 34 65-7~ NA 2020 Pred~ Suppres~
## 5 Connecticut 2020-08-15 CT 2020 33 75-8~ NA 2020 Pred~ Suppres~
## 6 Connecticut 2020-08-22 CT 2020 34 75-8~ NA 2020 Pred~ Suppres~
## 7 Connecticut 2020-08-15 CT 2020 33 85 y~ NA 2020 Pred~ Suppres~
## 8 North Carol~ 2020-08-15 NC 2020 33 25-4~ NA 2020 Pred~ Suppres~
## 9 North Carol~ 2020-08-15 NC 2020 33 45-6~ NA 2020 Pred~ Suppres~
## 10 North Carol~ 2020-08-22 NC 2020 34 45-6~ NA 2020 Pred~ Suppres~
## # ... with 14 more rows, and 1 more variable: Note <chr>
##
## *** Data suppression checks failed - total of 24 suppressions
## *** Of these suppressions, 24 are NOT from weekThru of current year
## Continuing since all states with problems are in stateNoCheck
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)
## Rows: 81,990
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## First duplicate is in row number (0 means no duplicates): 0
## Rows: 180,816
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date` <chr> "01/10/2015", "01/17/2015", "01/24/2015", "01/...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group` <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths` <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period` <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 180,816
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
## age n n_deaths_na deaths
## <chr> <int> <int> <dbl>
## 1 25-44 years 27094 7 3318967
## 2 45-64 years 33072 13 12975960
## 3 65-74 years 33058 14 12893879
## 4 75-84 years 33078 15 16026289
## 5 85 years and older 33066 16 20874940
## 6 Under 25 years 21448 0 1425191
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups: period, year [6]
## period year type n n_deaths_na deaths
## <chr> <int> <chr> <int> <int> <dbl>
## 1 2015-2019 2015 Predicted (weighted) 15285 0 5416393
## 2 2015-2019 2015 Unweighted 15285 0 5416393
## 3 2015-2019 2016 Predicted (weighted) 15366 0 5483774
## 4 2015-2019 2016 Unweighted 15366 0 5483774
## 5 2015-2019 2017 Predicted (weighted) 15317 0 5643340
## 6 2015-2019 2017 Unweighted 15317 0 5643340
## 7 2015-2019 2018 Predicted (weighted) 15307 0 5698023
## 8 2015-2019 2018 Unweighted 15307 0 5698023
## 9 2015-2019 2019 Predicted (weighted) 15317 0 5725524
## 10 2015-2019 2019 Unweighted 15317 0 5725524
## 11 2020 2020 Predicted (weighted) 13840 38 5840770
## 12 2020 2020 Unweighted 13792 27 5740348
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups: period [2]
## period Suppress n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-2019 <NA> 153184 0 5.59e7
## 2 2020 Suppressed (counts highly incomplete, <5~ 65 65 0.
## 3 2020 <NA> 27567 0 1.16e7
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 9 x 5
## # Groups: period [2]
## period Note n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-20~ <NA> 153184 0 5.59e7
## 2 2020 Data in recent weeks are incomplete. Only~ 22570 8 9.90e6
## 3 2020 Data in recent weeks are incomplete. Only~ 492 0 2.29e5
## 4 2020 Data in recent weeks are incomplete. Only~ 259 0 3.00e4
## 5 2020 Data in recent weeks are incomplete. Only~ 1925 57 4.47e5
## 6 2020 Data in recent weeks are incomplete. Only~ 24 0 1.31e4
## 7 2020 Estimates for Pennsylvania are too low fo~ 48 0 2.26e4
## 8 2020 Weights may be too low to account for und~ 168 0 4.54e4
## 9 2020 <NA> 2146 0 8.93e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## state Jurisdiction n n_deaths_na deaths
## 1 US United States 3684 0 33636341
## 2 CA California 3684 0 3198495
## 3 FL Florida 3684 0 2456535
## 4 TX Texas 3684 0 2424271
## 5 PA Pennsylvania 3684 0 1612787
## 6 OH Ohio 3684 0 1455156
## 7 IL Illinois 3684 0 1272621
## 8 NY New York 3684 0 1200005
## 9 MI Michigan 3684 0 1157271
## 10 NC North Carolina 3585 33 1072354
## 11 GA Georgia 3683 0 1009519
## 12 NJ New Jersey 3678 0 898785
## 13 TN Tennessee 3684 0 880363
## 14 VA Virginia 3684 0 808002
## 15 IN Indiana 3681 0 782822
## 16 MO Missouri 3682 0 762494
## 17 AZ Arizona 3684 0 712935
## 18 MA Massachusetts 3646 0 710364
## 19 YC New York City 3680 0 693824
## 20 WA Washington 3684 0 671307
## 21 AL Alabama 3683 0 624906
## 22 WI Wisconsin 3664 0 621043
## 23 MD Maryland 3678 0 594189
## 24 SC South Carolina 3680 0 586190
## 25 KY Kentucky 3642 0 569155
## 26 LA Louisiana 3680 0 549152
## 27 MN Minnesota 3637 0 526472
## 28 CO Colorado 3682 0 466081
## 29 OK Oklahoma 3672 0 465298
## 30 OR Oregon 3512 0 430304
## 31 MS Mississippi 3620 0 379957
## 32 AR Arkansas 3576 0 378866
## 33 CT Connecticut 3236 19 370453
## 34 IA Iowa 3313 0 355740
## 35 PR Puerto Rico 3392 0 344999
## 36 KS Kansas 3370 0 310278
## 37 NV Nevada 3415 0 301998
## 38 WV West Virginia 3102 13 258872
## 39 UT Utah 3571 0 223643
## 40 NM New Mexico 3251 0 213726
## 41 NE Nebraska 2960 0 197462
## 42 ME Maine 2750 0 166838
## 43 ID Idaho 2877 0 160328
## 44 NH New Hampshire 2772 0 140545
## 45 HI Hawaii 2658 0 129455
## 46 RI Rhode Island 2570 0 118811
## 47 MT Montana 2656 0 115410
## 48 DE Delaware 2662 0 103627
## 49 SD South Dakota 2544 0 91220
## 50 ND North Dakota 2526 0 78972
## 51 DC District of Columbia 2646 0 66641
## 52 VT Vermont 2428 0 64407
## 53 WY Wyoming 2411 0 49654
## 54 AK Alaska 2453 0 44283
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## Rows: 180,816
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 90,432
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
##
## *** Data suppression checks ***
## # A tibble: 38 x 11
## Jurisdiction weekEnding state year week age deaths period type Suppress
## <chr> <date> <chr> <fct> <int> <fct> <dbl> <fct> <chr> <chr>
## 1 Connecticut 2020-11-07 CT 2020 45 25-4~ NA 2020 Pred~ Suppres~
## 2 Connecticut 2020-11-14 CT 2020 46 25-4~ NA 2020 Pred~ Suppres~
## 3 Connecticut 2020-11-07 CT 2020 45 45-6~ NA 2020 Pred~ Suppres~
## 4 Connecticut 2020-11-14 CT 2020 46 45-6~ NA 2020 Pred~ Suppres~
## 5 Connecticut 2020-11-21 CT 2020 47 45-6~ NA 2020 Pred~ Suppres~
## 6 Connecticut 2020-11-07 CT 2020 45 65-7~ NA 2020 Pred~ Suppres~
## 7 Connecticut 2020-11-14 CT 2020 46 65-7~ NA 2020 Pred~ Suppres~
## 8 Connecticut 2020-11-07 CT 2020 45 75-8~ NA 2020 Pred~ Suppres~
## 9 Connecticut 2020-11-14 CT 2020 46 75-8~ NA 2020 Pred~ Suppres~
## 10 Connecticut 2020-11-21 CT 2020 47 75-8~ NA 2020 Pred~ Suppres~
## # ... with 28 more rows, and 1 more variable: Note <chr>
##
## *** Data suppression checks failed - total of 38 suppressions
## *** Of these suppressions, 38 are NOT from weekThru of current year
## Continuing since all states with problems are in stateNoCheck
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)
## Rows: 85,052
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## First duplicate is in row number (0 means no duplicates): 0
## Warning: Removed 11 row(s) containing missing values (geom_path).
# Inspect the structure of the returned list: the merged comparison (cdc)
# plus the two per-file weekly aggregates (cdc1, cdc2)
str(cdcCompare_0923_1206)
## List of 3
## $ cdc : tibble [307 x 3] (S3: tbl_df/tbl/data.frame)
## ..$ weekEnding: Date[1:307], format: "2015-01-10" "2015-01-17" ...
## ..$ deaths1 : num [1:307] 61684 61061 58613 57206 57205 ...
## ..$ deaths2 : num [1:307] 61685 61061 58604 57214 57205 ...
## $ cdc1: tibble [296 x 2] (S3: tbl_df/tbl/data.frame)
## ..$ weekEnding: Date[1:296], format: "2015-01-10" "2015-01-17" ...
## ..$ deaths : num [1:296] 61684 61061 58613 57206 57205 ...
## $ cdc2: tibble [307 x 2] (S3: tbl_df/tbl/data.frame)
## ..$ weekEnding: Date[1:307], format: "2015-01-10" "2015-01-17" ...
## ..$ deaths : num [1:307] 61685 61061 58604 57214 57205 ...
There is eventual modest restatement of CDC data in most weeks, and more significant restatement of CDC data in the four most recent weeks of the September 23, 2020 data file (latest available week ending September 5; last mostly non-restated data week ending August 8).
The function is updated to return either a list (includes each CDC data file) or just the differences file.
The function is run again for two different time periods:
# Second comparison window: raw CDC downloads from 2020-10-14 and 2020-11-20
loc3 <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20201014.csv"
loc4 <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20201120.csv"
# Keep the full list again so both underlying files remain available
cdcCompare_1014_1120 <- cdcDeathCompare(loc3, loc4, returnList=TRUE)
## Rows: 176,119
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date` <chr> "01/10/2015", "01/17/2015", "01/24/2015", "01/...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group` <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths` <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period` <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 176,119
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
## age n n_deaths_na deaths
## <chr> <int> <int> <dbl>
## 1 25-44 years 26364 0 3214460
## 2 45-64 years 32211 4 12612924
## 3 65-74 years 32200 5 12500875
## 4 75-84 years 32221 6 15546198
## 5 85 years and older 32210 7 20287067
## 6 Under 25 years 20913 0 1388384
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups: period, year [6]
## period year type n n_deaths_na deaths
## <chr> <int> <chr> <int> <int> <dbl>
## 1 2015-2019 2015 Predicted (weighted) 15285 0 5416393
## 2 2015-2019 2015 Unweighted 15285 0 5416393
## 3 2015-2019 2016 Predicted (weighted) 15365 0 5483764
## 4 2015-2019 2016 Unweighted 15365 0 5483764
## 5 2015-2019 2017 Predicted (weighted) 15319 0 5643363
## 6 2015-2019 2017 Unweighted 15319 0 5643363
## 7 2015-2019 2018 Predicted (weighted) 15305 0 5698004
## 8 2015-2019 2018 Unweighted 15305 0 5698004
## 9 2015-2019 2019 Predicted (weighted) 15317 0 5725524
## 10 2015-2019 2019 Unweighted 15317 0 5725524
## 11 2020 2020 Predicted (weighted) 11490 13 4858272
## 12 2020 2020 Unweighted 11447 9 4757540
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups: period [2]
## period Suppress n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-2019 <NA> 153182 0 5.59e7
## 2 2020 Suppressed (counts highly incomplete, <5~ 22 22 0.
## 3 2020 <NA> 22915 0 9.62e6
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 10 x 5
## # Groups: period [2]
## period Note n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-20~ <NA> 153182 0 5.59e7
## 2 2020 Data in recent weeks are incomplete. Onl~ 18444 0 8.13e6
## 3 2020 Data in recent weeks are incomplete. Onl~ 360 0 1.68e5
## 4 2020 Data in recent weeks are incomplete. Onl~ 382 19 4.45e4
## 5 2020 Data in recent weeks are incomplete. Onl~ 1329 3 2.88e5
## 6 2020 Data in recent weeks are incomplete. Onl~ 60 0 2.79e4
## 7 2020 Estimates for Pennsylvania are too low f~ 36 0 1.68e4
## 8 2020 Weights may be too low to account for un~ 284 0 7.19e4
## 9 2020 Weights may be too low to account for un~ 12 0 5.72e3
## 10 2020 <NA> 2030 0 8.65e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## state Jurisdiction n n_deaths_na deaths
## 1 US United States 3588 0 32655734
## 2 CA California 3588 0 3108728
## 3 FL Florida 3588 0 2387134
## 4 TX Texas 3588 0 2348924
## 5 PA Pennsylvania 3588 0 1566142
## 6 OH Ohio 3588 0 1411315
## 7 IL Illinois 3588 0 1232847
## 8 NY New York 3588 0 1167718
## 9 MI Michigan 3588 0 1123550
## 10 NC North Carolina 3536 8 1059363
## 11 GA Georgia 3588 0 978193
## 12 NJ New Jersey 3580 0 875776
## 13 TN Tennessee 3588 0 851531
## 14 VA Virginia 3588 0 782913
## 15 IN Indiana 3587 0 758488
## 16 MO Missouri 3586 0 737546
## 17 AZ Arizona 3588 0 692460
## 18 MA Massachusetts 3554 0 692109
## 19 YC New York City 3584 0 676973
## 20 WA Washington 3588 0 652928
## 21 AL Alabama 3585 0 606105
## 22 WI Wisconsin 3570 0 598878
## 23 MD Maryland 3582 0 576822
## 24 SC South Carolina 3586 0 568254
## 25 KY Kentucky 3553 0 552088
## 26 LA Louisiana 3577 0 532758
## 27 MN Minnesota 3547 0 509530
## 28 CO Colorado 3586 0 451948
## 29 OK Oklahoma 3576 0 450316
## 30 OR Oregon 3418 0 418422
## 31 MS Mississippi 3522 0 368372
## 32 AR Arkansas 3484 0 366066
## 33 CT Connecticut 3146 9 360662
## 34 IA Iowa 3224 0 343678
## 35 PR Puerto Rico 3305 0 336026
## 36 KS Kansas 3282 0 300158
## 37 NV Nevada 3327 0 292844
## 38 WV West Virginia 3033 3 253132
## 39 UT Utah 3474 0 216546
## 40 NM New Mexico 3170 0 207657
## 41 NE Nebraska 2878 0 190649
## 42 ME Maine 2676 0 162212
## 43 ID Idaho 2796 0 154882
## 44 NH New Hampshire 2700 0 136571
## 45 HI Hawaii 2588 0 125932
## 46 RI Rhode Island 2504 0 115701
## 47 MT Montana 2584 0 111052
## 48 DE Delaware 2590 0 100672
## 49 SD South Dakota 2478 0 87630
## 50 ND North Dakota 2459 0 75823
## 51 DC District of Columbia 2571 0 64792
## 52 VT Vermont 2362 0 62663
## 53 WY Wyoming 2348 0 47960
## 54 AK Alaska 2379 2 42735
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## Rows: 176,119
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 88,081
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
##
## *** Data suppression checks ***
## # A tibble: 13 x 11
## Jurisdiction weekEnding state year week age deaths period type Suppress
## <chr> <date> <chr> <fct> <int> <fct> <dbl> <fct> <chr> <chr>
## 1 Alaska 2020-09-26 AK 2020 39 85 y~ NA 2020 Pred~ Suppres~
## 2 Connecticut 2020-09-12 CT 2020 37 45-6~ NA 2020 Pred~ Suppres~
## 3 Connecticut 2020-09-12 CT 2020 37 65-7~ NA 2020 Pred~ Suppres~
## 4 Connecticut 2020-09-12 CT 2020 37 75-8~ NA 2020 Pred~ Suppres~
## 5 Connecticut 2020-09-19 CT 2020 38 75-8~ NA 2020 Pred~ Suppres~
## 6 Connecticut 2020-09-12 CT 2020 37 85 y~ NA 2020 Pred~ Suppres~
## 7 North Carol~ 2020-08-29 NC 2020 35 45-6~ NA 2020 Pred~ Suppres~
## 8 North Carol~ 2020-08-29 NC 2020 35 65-7~ NA 2020 Pred~ Suppres~
## 9 North Carol~ 2020-08-29 NC 2020 35 75-8~ NA 2020 Pred~ Suppres~
## 10 North Carol~ 2020-08-29 NC 2020 35 85 y~ NA 2020 Pred~ Suppres~
## 11 West Virgin~ 2020-09-26 WV 2020 39 65-7~ NA 2020 Pred~ Suppres~
## 12 West Virgin~ 2020-09-26 WV 2020 39 75-8~ NA 2020 Pred~ Suppres~
## 13 West Virgin~ 2020-09-26 WV 2020 39 85 y~ NA 2020 Pred~ Suppres~
## # ... with 1 more variable: Note <chr>
##
## *** Data suppression checks failed - total of 13 suppressions
## *** Of these suppressions, 13 are NOT from weekThru of current year
## Continuing since all states with problems are in stateNoCheck
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)
## Rows: 82,841
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## First duplicate is in row number (0 means no duplicates): 0Rows: 179,663
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date` <chr> "01/10/2015", "01/17/2015", "01/24/2015", "01/...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group` <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths` <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period` <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 179,663
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
## age n n_deaths_na deaths
## <chr> <int> <int> <dbl>
## 1 25-44 years 26907 4 3293947
## 2 45-64 years 32860 15 12885563
## 3 65-74 years 32851 16 12792037
## 4 75-84 years 32870 19 15899092
## 5 85 years and older 32856 19 20718264
## 6 Under 25 years 21319 2 1416502
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups: period, year [6]
## period year type n n_deaths_na deaths
## <chr> <int> <chr> <int> <int> <dbl>
## 1 2015-2019 2015 Predicted (weighted) 15285 0 5416391
## 2 2015-2019 2015 Unweighted 15285 0 5416391
## 3 2015-2019 2016 Predicted (weighted) 15365 0 5483764
## 4 2015-2019 2016 Unweighted 15365 0 5483764
## 5 2015-2019 2017 Predicted (weighted) 15318 0 5643347
## 6 2015-2019 2017 Unweighted 15318 0 5643347
## 7 2015-2019 2018 Predicted (weighted) 15307 0 5698022
## 8 2015-2019 2018 Unweighted 15307 0 5698022
## 9 2015-2019 2019 Predicted (weighted) 15318 0 5725502
## 10 2015-2019 2019 Unweighted 15318 0 5725502
## 11 2020 2020 Predicted (weighted) 13260 41 5584252
## 12 2020 2020 Unweighted 13217 34 5487101
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups: period [2]
## period Suppress n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-2019 <NA> 153186 0 5.59e7
## 2 2020 Suppressed (counts highly incomplete, <5~ 75 75 0.
## 3 2020 <NA> 26402 0 1.11e7
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 9 x 5
## # Groups: period [2]
## period Note n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-20~ <NA> 153186 0 5.59e7
## 2 2020 Data in recent weeks are incomplete. Only~ 21043 34 9.13e6
## 3 2020 Data in recent weeks are incomplete. Only~ 444 0 2.04e5
## 4 2020 Data in recent weeks are incomplete. Only~ 339 22 4.45e4
## 5 2020 Data in recent weeks are incomplete. Only~ 2241 19 7.10e5
## 6 2020 Data in recent weeks are incomplete. Only~ 48 0 2.62e4
## 7 2020 Estimates for Pennsylvania are too low fo~ 48 0 2.26e4
## 8 2020 Weights may be too low to account for und~ 312 0 1.16e5
## 9 2020 <NA> 2002 0 8.22e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## state Jurisdiction n n_deaths_na deaths
## 1 US United States 3660 0 33382812
## 2 CA California 3660 0 3175134
## 3 FL Florida 3660 0 2439960
## 4 TX Texas 3660 0 2405990
## 5 PA Pennsylvania 3660 0 1600457
## 6 OH Ohio 3660 0 1443190
## 7 IL Illinois 3660 0 1260836
## 8 NY New York 3660 0 1191716
## 9 MI Michigan 3660 0 1148050
## 10 NC North Carolina 3573 33 1068292
## 11 GA Georgia 3659 0 1002322
## 12 NJ New Jersey 3654 0 892684
## 13 TN Tennessee 3660 0 872960
## 14 VA Virginia 3660 0 802114
## 15 IN Indiana 3658 0 776857
## 16 MO Missouri 3656 0 755398
## 17 AZ Arizona 3660 0 707602
## 18 MA Massachusetts 3624 0 705559
## 19 YC New York City 3656 0 689474
## 20 WA Washington 3658 10 664276
## 21 AL Alabama 3658 0 620268
## 22 WI Wisconsin 3640 0 614607
## 23 MD Maryland 3654 0 589675
## 24 SC South Carolina 3658 0 582138
## 25 KY Kentucky 3621 0 564915
## 26 LA Louisiana 3655 0 545500
## 27 MN Minnesota 3612 0 521445
## 28 CO Colorado 3657 0 462126
## 29 OK Oklahoma 3649 0 461666
## 30 OR Oregon 3490 0 427650
## 31 MS Mississippi 3595 0 376968
## 32 AR Arkansas 3552 0 375477
## 33 CT Connecticut 3215 17 367731
## 34 IA Iowa 3293 0 352522
## 35 PR Puerto Rico 3372 0 343055
## 36 KS Kansas 3348 0 307450
## 37 NV Nevada 3394 0 299546
## 38 WV West Virginia 3098 11 258798
## 39 UT Utah 3546 0 221678
## 40 NM New Mexico 3231 0 212208
## 41 NE Nebraska 2942 0 195453
## 42 ME Maine 2732 0 165642
## 43 ID Idaho 2856 0 158780
## 44 NH New Hampshire 2751 0 139437
## 45 HI Hawaii 2642 0 128673
## 46 RI Rhode Island 2553 0 117919
## 47 MT Montana 2638 0 114177
## 48 DE Delaware 2645 0 102858
## 49 SD South Dakota 2522 4 89560
## 50 ND North Dakota 2512 0 78477
## 51 DC District of Columbia 2630 0 66270
## 52 VT Vermont 2416 0 64000
## 53 WY Wyoming 2395 0 49201
## 54 AK Alaska 2433 0 43852
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## Rows: 179,663
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 89,853
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
##
## *** Data suppression checks ***
## # A tibble: 41 x 11
## Jurisdiction weekEnding state year week age deaths period type Suppress
## <chr> <date> <chr> <fct> <int> <fct> <dbl> <fct> <chr> <chr>
## 1 Connecticut 2020-10-24 CT 2020 43 45-6~ NA 2020 Pred~ Suppres~
## 2 Connecticut 2020-10-31 CT 2020 44 45-6~ NA 2020 Pred~ Suppres~
## 3 Connecticut 2020-10-24 CT 2020 43 65-7~ NA 2020 Pred~ Suppres~
## 4 Connecticut 2020-10-31 CT 2020 44 65-7~ NA 2020 Pred~ Suppres~
## 5 Connecticut 2020-10-24 CT 2020 43 75-8~ NA 2020 Pred~ Suppres~
## 6 Connecticut 2020-10-31 CT 2020 44 75-8~ NA 2020 Pred~ Suppres~
## 7 Connecticut 2020-11-07 CT 2020 45 75-8~ NA 2020 Pred~ Suppres~
## 8 Connecticut 2020-10-24 CT 2020 43 85 y~ NA 2020 Pred~ Suppres~
## 9 Connecticut 2020-10-31 CT 2020 44 85 y~ NA 2020 Pred~ Suppres~
## 10 North Carol~ 2020-09-05 NC 2020 36 25-4~ NA 2020 Pred~ Suppres~
## # ... with 31 more rows, and 1 more variable: Note <chr>
##
## *** Data suppression checks failed - total of 41 suppressions
## *** Of these suppressions, 41 are NOT from weekThru of current year
## Continuing since all states with problems are in stateNoCheck
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)
## Rows: 84,507
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## First duplicate is in row number (0 means no duplicates): 0
## Warning: Removed 6 row(s) containing missing values (geom_path).
# Inspect the returned comparison object: a 3-element list holding the joined
# comparison tibble (`cdc`: weekEnding, deaths1, deaths2) plus the two
# single-file tibbles (`cdc1`, `cdc2`)
str(cdcCompare_1014_1120)
## List of 3
## $ cdc : tibble [305 x 3] (S3: tbl_df/tbl/data.frame)
## ..$ weekEnding: Date[1:305], format: "2015-01-10" "2015-01-17" ...
## ..$ deaths1 : num [1:305] 61684 61069 58605 57206 57205 ...
## ..$ deaths2 : num [1:305] 61684 61062 58612 57206 57205 ...
## $ cdc1: tibble [299 x 2] (S3: tbl_df/tbl/data.frame)
## ..$ weekEnding: Date[1:299], format: "2015-01-10" "2015-01-17" ...
## ..$ deaths : num [1:299] 61684 61069 58605 57206 57205 ...
## $ cdc2: tibble [305 x 2] (S3: tbl_df/tbl/data.frame)
## ..$ weekEnding: Date[1:305], format: "2015-01-10" "2015-01-17" ...
## ..$ deaths : num [1:305] 61684 61062 58612 57206 57205 ...
A similar pattern is seen, with larger restatements in the more recent months of the older data file.
The function is updated so that it can accept either a data frame or a character path to a CSV file. For example, the already-loaded data frames can be passed directly:
# Compare the 2020-10-14 file against the 2020-12-06 file, passing the
# already-loaded data frames directly (rather than character CSV paths);
# returnList=FALSE returns just the joined comparison tibble instead of the
# 3-element list
cdcCompare_1014_1206 <- cdcDeathCompare(loc1=cdcCompare_1014_1120$cdc1,
loc2=cdcCompare_0923_1206$cdc2,
returnList=FALSE
)
## Warning: Removed 8 row(s) containing missing values (geom_path).
# With returnList=FALSE the result is a single tibble
# (weekEnding, deaths1, deaths2) rather than a list
str(cdcCompare_1014_1206)
## tibble [307 x 3] (S3: tbl_df/tbl/data.frame)
## $ weekEnding: Date[1:307], format: "2015-01-10" "2015-01-17" ...
## $ deaths1 : num [1:307] 61684 61069 58605 57206 57205 ...
## $ deaths2 : num [1:307] 61685 61061 58604 57214 57205 ...
Data are integrated from across four files:
# Combine the week-level death totals from the four downloaded CDC files into
# one long-format frame: one row per (weekEnding, source-file date) pair.
# Column names carry the file download date (YYYYMMDD) so pivot_longer() can
# turn them into a `source` column, parsed to Date via lubridate::ymd().
cdcFourFiles <- cdcCompare_0923_1206$cdc %>%
  rename('20200923'=deaths1, '20201206'=deaths2) %>%
  # Explicit `by=` (fix): avoids the "Joining, by = ..." message and guards
  # against an accidental implicit join on any other shared column name
  full_join(rename(cdcCompare_1014_1120$cdc, '20201014'=deaths1, '20201120'=deaths2),
            by="weekEnding") %>%
  pivot_longer(-weekEnding, names_to="source", values_to="reportedDeaths") %>%
  mutate(source=lubridate::ymd(source)) %>%
  arrange(weekEnding, source)
## Joining, by = "weekEnding"
# Display the long-format result: one row per (weekEnding, source file) pair
cdcFourFiles
## # A tibble: 1,228 x 3
## weekEnding source reportedDeaths
## <date> <date> <dbl>
## 1 2015-01-10 2020-09-23 61684
## 2 2015-01-10 2020-10-14 61684
## 3 2015-01-10 2020-11-20 61684
## 4 2015-01-10 2020-12-06 61685
## 5 2015-01-17 2020-09-23 61061
## 6 2015-01-17 2020-10-14 61069
## 7 2015-01-17 2020-11-20 61062
## 8 2015-01-17 2020-12-06 61061
## 9 2015-01-24 2020-09-23 58613
## 10 2015-01-24 2020-10-14 58605
## # ... with 1,218 more rows
# Function to plot restatement levels for a file
plotCDCRestatement <- function(df,
                               keyVar="reportedDeaths",
                               keyYears=c(2020),
                               firstWeek=NULL
                               ) {
  # FUNCTION ARGUMENTS
  # df: a frame containing weekEnding, source, and the value column named by keyVar
  # keyVar: character string naming the value column; defaults to "reportedDeaths"
  # keyYears: calendar years (of weekEnding) to be included in the plots
  # firstWeek: character description of the baseline file date
  #            (NULL means calculate it as the earliest `source` date in the data)
  #
  # Side effects: prints two ggplots (ratio-by-week lines; ratio vs. weeks-prior text)
  # Returns: the result of printing the second plot (invisible, per print.ggplot)

  # Keep only weeks where keyVar is populated for every source file, so that
  # ratios are computed over a consistent set of weeks across files
  dfNoNA <- df %>%
    rename("numKey"=all_of(keyVar)) %>%
    group_by(weekEnding) %>%
    summarize(numNA=sum(is.na(numKey)), .groups="drop") %>%
    filter(numNA==0) %>%
    select(weekEnding) %>%
    inner_join(df, by="weekEnding") %>%
    # all_of() added (fix): a bare external vector in rename() triggers the
    # tidyselect "ambiguous selection" message seen in earlier runs
    rename("numKey"=all_of(keyVar))
  if (is.null(firstWeek)) {
    firstWeek <- dfNoNA %>% pull(source) %>% min() %>% as.character()
  }
  # Ratio of each source file's value to the earliest file's value, per week
  dfRatio <- dfNoNA %>%
    arrange(weekEnding, source) %>%
    group_by(weekEnding) %>%
    mutate(firstNum=first(numKey), pctFirst=numKey/firstNum) %>%
    ungroup()
  # Plot 1: restatement ratio by week, one line per source file
  p1 <- dfRatio %>%
    filter(lubridate::year(weekEnding) %in% keyYears) %>%
    ggplot(aes(x=weekEnding, y=pctFirst)) +
    geom_line(aes(group=source, color=factor(source))) +
    labs(x="Data reported for week",
         y=paste0("Ratio vs. reported in ", firstWeek, " file"),
         title="Ratio of CDC all-cause deaths reported by week",
         subtitle=paste0("Compared to number by week reported in ", firstWeek, " file")
         ) +
    scale_color_discrete("Download Date")
  print(p1)
  # Plot 2: latest-file ratio labeled as a percentage, positioned by how many
  # weeks prior to the most recent plotted week each week falls
  p2 <- dfRatio %>%
    filter(lubridate::year(weekEnding) %in% keyYears) %>%
    mutate(weeksPrior=as.integer(max(weekEnding)-weekEnding)/7) %>%
    group_by(weekEnding) %>%
    filter(row_number()==n()) %>%  # keep the most recent source per week
    ggplot(aes(x=weeksPrior, y=pctFirst)) +
    geom_text(aes(label=paste0(round(100*pctFirst), "%")), size=3) +
    labs(x=paste0("Weeks prior to latest week reported in ", firstWeek, " file"),
         y=paste0("Ratio vs. reported in ", firstWeek, " file"),
         title="Ratio of CDC all-cause deaths reported by week",
         subtitle=paste0("Compared to number by week reported in ", firstWeek, " file")
         )
  print(p2)
}
# Plot restatement ratios using all four downloaded files (earliest file,
# 2020-09-23, serves as the baseline)
plotCDCRestatement(cdcFourFiles)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(keyVar)` instead of `keyVar` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
It appears that projected deaths in the 2020-09-23 file were mostly finalized prior to the most recent four weeks. Inflation factors of ~125%, ~110%, ~105%, and ~105% would have been needed for the current week and the three preceding weeks to match the ‘final’ totals for those weeks as reported in the 2020-12-06 file.
The plotting routine has been converted to functional form and re-run. A comparison of percentages is then made using the data from 2020-10-14 as the baseline:
# Re-run the restatement plots using only files downloaded on or after
# 2020-10-14, which makes that file the comparison baseline
plotCDCRestatement(filter(cdcFourFiles, source >= as.Date("2020-10-14")))
And the process is repeated, but excluding the 2020-12-06 file:
# Repeat the restatement plots while excluding the 2020-12-06 file
plotCDCRestatement(filter(cdcFourFiles, source < as.Date("2020-12-06")))
There is some directional evidence that the extra lag may be less in the 2020-10-14 file than in the 2020-09-23 file. Further exploration of this topic could be interesting.
The latest CDC all-cause death data are downloaded:
# Download new data
# readRunCDCAllCause() (sourced from the CDC functions file) reads/downloads the
# weekly all-cause death CSV, runs the validation and suppression checks, and
# returns a processed list -- see the check output below for what it verifies
cdcLoc <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20201213.csv"
cdcList_20201213 <- readRunCDCAllCause(loc=cdcLoc,
startYear=2015,  # first year of the historical baseline period
curYear=2020,  # current (partial) year
weekThru=40,  # last week of curYear used for checks -- see suppression messages
startWeek=9,
lst=test_hier5_201130,
epiMap=readFromRDS("epiMonth"),
agePopData=readFromRDS("usPopBucket2020"),
cvDeathThru="2020-10-03",
cdcPlotStartWeek=10,
dlData=!file.exists(paste0("./RInputFiles/Coronavirus/", cdcLoc)),  # download only if not already cached locally
stateNoCheck=c("NC")  # states exempt from failing the suppression checks
)
## Rows: 181,400
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "A...
## $ `Week Ending Date` <chr> "01/10/2015", "01/17/2015", "01/24/2015", "01/...
## $ `State Abbreviation` <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"...
## $ Year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015...
## $ Week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ `Age Group` <chr> "25-44 years", "25-44 years", "25-44 years", "...
## $ `Number of Deaths` <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50...
## $ `Time Period` <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2...
## $ Type <chr> "Predicted (weighted)", "Predicted (weighted)"...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 181,400
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <int> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <chr> "25-44 years", "25-44 years", "25-44 years", "25-44 ye...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <chr> "2015-2019", "2015-2019", "2015-2019", "2015-2019", "2...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## Check Control Levels and Record Counts for Renamed Data:
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
## age n n_deaths_na deaths
## <chr> <int> <int> <dbl>
## 1 25-44 years 27179 9 3331642
## 2 45-64 years 33177 18 13021484
## 3 65-74 years 33168 18 12947006
## 4 75-84 years 33188 20 16091745
## 5 85 years and older 33181 23 20957230
## 6 Under 25 years 21507 0 1429214
## `summarise()` regrouping output by 'period', 'year' (override with `.groups` argument)
## # A tibble: 12 x 6
## # Groups: period, year [6]
## period year type n n_deaths_na deaths
## <chr> <int> <chr> <int> <int> <dbl>
## 1 2015-2019 2015 Predicted (weighted) 15285 0 5416393
## 2 2015-2019 2015 Unweighted 15285 0 5416393
## 3 2015-2019 2016 Predicted (weighted) 15365 0 5483764
## 4 2015-2019 2016 Unweighted 15365 0 5483764
## 5 2015-2019 2017 Predicted (weighted) 15316 0 5643329
## 6 2015-2019 2017 Unweighted 15316 0 5643329
## 7 2015-2019 2018 Predicted (weighted) 15306 0 5698014
## 8 2015-2019 2018 Unweighted 15306 0 5698014
## 9 2015-2019 2019 Predicted (weighted) 15318 0 5725533
## 10 2015-2019 2019 Unweighted 15318 0 5725533
## 11 2020 2020 Predicted (weighted) 14137 49 5976187
## 12 2020 2020 Unweighted 14083 39 5868068
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 3 x 5
## # Groups: period [2]
## period Suppress n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-2019 <NA> 153180 0 5.59e7
## 2 2020 Suppressed (counts highly incomplete, <5~ 88 88 0.
## 3 2020 <NA> 28132 0 1.18e7
## `summarise()` regrouping output by 'period' (override with `.groups` argument)
## # A tibble: 9 x 5
## # Groups: period [2]
## period Note n n_deaths_na deaths
## <chr> <chr> <int> <int> <dbl>
## 1 2015-20~ <NA> 153180 0 5.59e7
## 2 2020 Data in recent weeks are incomplete. Only~ 23072 0 1.01e7
## 3 2020 Data in recent weeks are incomplete. Only~ 468 0 2.18e5
## 4 2020 Data in recent weeks are incomplete. Only~ 220 26 2.44e4
## 5 2020 Data in recent weeks are incomplete. Only~ 2038 62 4.95e5
## 6 2020 Data in recent weeks are incomplete. Only~ 60 0 3.06e4
## 7 2020 Estimates for Pennsylvania are too low fo~ 48 0 2.26e4
## 8 2020 Weights may be too low to account for und~ 134 0 4.01e4
## 9 2020 <NA> 2180 0 8.98e5
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## state Jurisdiction n n_deaths_na deaths
## 1 US United States 3696 0 33768456
## 2 CA California 3696 0 3207848
## 3 FL Florida 3696 0 2465594
## 4 TX Texas 3696 0 2434435
## 5 PA Pennsylvania 3696 0 1619409
## 6 OH Ohio 3696 0 1461352
## 7 IL Illinois 3696 0 1278684
## 8 NY New York 3696 0 1204510
## 9 MI Michigan 3696 0 1162511
## 10 NC North Carolina 3600 48 1072568
## 11 GA Georgia 3695 0 1012939
## 12 NJ New Jersey 3689 0 902620
## 13 TN Tennessee 3696 0 883980
## 14 VA Virginia 3696 0 810957
## 15 IN Indiana 3694 0 787707
## 16 MO Missouri 3692 0 766234
## 17 AZ Arizona 3696 0 715040
## 18 MA Massachusetts 3658 0 712881
## 19 YC New York City 3691 0 695737
## 20 WA Washington 3695 0 673730
## 21 AL Alabama 3695 0 627446
## 22 WI Wisconsin 3676 0 624424
## 23 MD Maryland 3690 0 596634
## 24 SC South Carolina 3692 0 587910
## 25 KY Kentucky 3654 0 571559
## 26 LA Louisiana 3691 0 551128
## 27 MN Minnesota 3647 0 529039
## 28 CO Colorado 3694 0 468347
## 29 OK Oklahoma 3682 0 466810
## 30 OR Oregon 3523 0 432033
## 31 MS Mississippi 3630 0 381529
## 32 AR Arkansas 3589 0 380474
## 33 CT Connecticut 3244 26 370676
## 34 IA Iowa 3324 0 358019
## 35 PR Puerto Rico 3401 0 346043
## 36 KS Kansas 3382 0 311724
## 37 NV Nevada 3429 0 303343
## 38 WV West Virginia 3119 14 260054
## 39 UT Utah 3581 0 224667
## 40 NM New Mexico 3260 0 214877
## 41 NE Nebraska 2968 0 198537
## 42 ME Maine 2760 0 167532
## 43 ID Idaho 2885 0 160947
## 44 NH New Hampshire 2780 0 141077
## 45 HI Hawaii 2668 0 129851
## 46 RI Rhode Island 2577 0 119134
## 47 MT Montana 2666 0 115998
## 48 DE Delaware 2670 0 103965
## 49 SD South Dakota 2551 0 91836
## 50 ND North Dakota 2537 0 79575
## 51 DC District of Columbia 2652 0 66835
## 52 VT Vermont 2434 0 64666
## 53 WY Wyoming 2420 0 49956
## 54 AK Alaska 2463 0 44484
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## Rows: 181,400
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## Rows: 88,455
## Columns: 11
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-17, 2015-01-24, 2015-01-31, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ age <fct> 25-44 years, 25-44 years, 25-44 years, 25-44 years, 25...
## $ deaths <dbl> 67, 49, 55, 59, 47, 59, 41, 47, 59, 57, 54, 50, 58, 42...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
##
## *** Data suppression checks ***
## # A tibble: 19 x 11
## Jurisdiction weekEnding state year week age deaths period type Suppress
## <chr> <date> <chr> <fct> <int> <fct> <dbl> <fct> <chr> <chr>
## 1 North Carol~ 2020-09-12 NC 2020 37 25-4~ NA 2020 Pred~ Suppres~
## 2 North Carol~ 2020-09-19 NC 2020 38 25-4~ NA 2020 Pred~ Suppres~
## 3 North Carol~ 2020-09-26 NC 2020 39 25-4~ NA 2020 Pred~ Suppres~
## 4 North Carol~ 2020-09-12 NC 2020 37 45-6~ NA 2020 Pred~ Suppres~
## 5 North Carol~ 2020-09-19 NC 2020 38 45-6~ NA 2020 Pred~ Suppres~
## 6 North Carol~ 2020-09-26 NC 2020 39 45-6~ NA 2020 Pred~ Suppres~
## 7 North Carol~ 2020-10-03 NC 2020 40 45-6~ NA 2020 Pred~ Suppres~
## 8 North Carol~ 2020-09-12 NC 2020 37 65-7~ NA 2020 Pred~ Suppres~
## 9 North Carol~ 2020-09-19 NC 2020 38 65-7~ NA 2020 Pred~ Suppres~
## 10 North Carol~ 2020-09-26 NC 2020 39 65-7~ NA 2020 Pred~ Suppres~
## 11 North Carol~ 2020-10-03 NC 2020 40 65-7~ NA 2020 Pred~ Suppres~
## 12 North Carol~ 2020-09-12 NC 2020 37 75-8~ NA 2020 Pred~ Suppres~
## 13 North Carol~ 2020-09-19 NC 2020 38 75-8~ NA 2020 Pred~ Suppres~
## 14 North Carol~ 2020-09-26 NC 2020 39 75-8~ NA 2020 Pred~ Suppres~
## 15 North Carol~ 2020-10-03 NC 2020 40 75-8~ NA 2020 Pred~ Suppres~
## 16 North Carol~ 2020-09-12 NC 2020 37 85 y~ NA 2020 Pred~ Suppres~
## 17 North Carol~ 2020-09-19 NC 2020 38 85 y~ NA 2020 Pred~ Suppres~
## 18 North Carol~ 2020-09-26 NC 2020 39 85 y~ NA 2020 Pred~ Suppres~
## 19 North Carol~ 2020-10-03 NC 2020 40 85 y~ NA 2020 Pred~ Suppres~
## # ... with 1 more variable: Note <chr>
##
## *** Data suppression checks failed - total of 19 suppressions
## *** Of these suppressions, 15 are NOT from weekThru of current year
## Continuing since all states with problems are in stateNoCheck
## `summarise()` regrouping output by 'Jurisdiction', 'weekEnding', 'state', 'year', 'week', 'age', 'period', 'type' (override with `.groups` argument)
## Rows: 83,195
## Columns: 12
## $ Jurisdiction <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ weekEnding <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", ...
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, ...
## $ week <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, ...
## $ age <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years,...
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019,...
## $ type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predi...
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ n <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ deaths <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332...
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
##
## First duplicate is in row number (0 means no duplicates): 0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)
## `summarise()` regrouping output by 'year', 'week' (override with `.groups` argument)
## `summarise()` regrouping output by 'year', 'age', 'week' (override with `.groups` argument)
##
## Plots will be run after excluding stateNoCheck states
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'state', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = "state"
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
## `summarise()` regrouping output by 'age', 'quarter', 'month' (override with `.groups` argument)
## `summarise()` regrouping output by 'age' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
saveToRDS(cdcList_20201213, ovrWriteError=FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/cdcList_20201213.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
New state-level data are downloaded, with existing segments used:
# Re-run the COVID Tracking Project pipeline on updated data, keeping the
# existing segment/cluster definitions from the Nov 30 run.
locDownload <- "./RInputFiles/Coronavirus/CV_downloaded_201214.csv"
# Only download when no local copy exists; otherwise re-read the cached CSV so
# this run stays reproducible for the Dec 14 snapshot.
old_hier5_201214 <- readRunCOVIDTrackingProject(
  thruLabel="Dec 13, 2020",
  downloadTo=if(!file.exists(locDownload)) locDownload else NULL,
  readFrom=locDownload,
  compareFile=readFromRDS("test_hier5_201025")$dfRaw,
  useClusters=readFromRDS("test_hier5_201130")$useClusters
)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## state = col_character(),
## totalTestResultsSource = col_character(),
## dataQualityGrade = col_character(),
## lastUpdateEt = col_character(),
## dateModified = col_datetime(format = ""),
## checkTimeEt = col_character(),
## dateChecked = col_datetime(format = ""),
## fips = col_character(),
## hash = col_character(),
## grade = col_logical()
## )
## i Use `spec()` for the full column specifications.
##
## File is unique by state and date
##
##
## Overall control totals in file:
## # A tibble: 1 x 3
## positiveIncrease deathIncrease hospitalizedCurrently
## <dbl> <dbl> <dbl>
## 1 16339303 292404 12655613
##
## *** COMPARISONS TO REFERENCE FILE: compareFile
##
## Checkin for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checkin for similarity of: states
## In reference but not in current:
## In current but not in reference:
##
## Checkin for similarity of: dates
## In reference but not in current:
## In current but not in reference: 2020-12-14 2020-12-13 2020-12-12 2020-12-11 2020-12-10 2020-12-09 2020-12-08 2020-12-07 2020-12-06 2020-12-05 2020-12-04 2020-12-03 2020-12-02 2020-12-01 2020-11-30 2020-11-29 2020-11-28 2020-11-27 2020-11-26 2020-11-25 2020-11-24 2020-11-23 2020-11-22 2020-11-21 2020-11-20 2020-11-19 2020-11-18 2020-11-17 2020-11-16 2020-11-15 2020-11-14 2020-11-13 2020-11-12 2020-11-11 2020-11-10 2020-11-09 2020-11-08 2020-11-07 2020-11-06 2020-11-05 2020-11-04 2020-11-03 2020-11-02 2020-11-01 2020-10-31 2020-10-30 2020-10-29 2020-10-28 2020-10-27 2020-10-26 2020-10-25
##
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("date", "name")
## date name newValue oldValue
## 1 2020-03-05 positiveIncrease 86 103
## 2 2020-03-06 positiveIncrease 128 109
## 3 2020-03-07 positiveIncrease 129 176
## 4 2020-03-10 positiveIncrease 441 387
## 5 2020-03-11 positiveIncrease 497 509
## 6 2020-03-12 positiveIncrease 745 671
## 7 2020-03-13 positiveIncrease 933 1072
## 8 2020-03-14 positiveIncrease 970 924
## 9 2020-03-15 positiveIncrease 1217 1291
## 10 2020-03-16 positiveIncrease 1847 1739
## 11 2020-03-17 positiveIncrease 2249 2588
## 12 2020-03-18 positiveIncrease 3364 3089
## 13 2020-03-21 hospitalizedCurrently 1492 1436
## 14 2020-03-23 hospitalizedCurrently 2812 2770
## 15 2020-03-24 positiveIncrease 11116 10769
## 16 2020-03-25 positiveIncrease 12590 12908
## 17 2020-03-25 hospitalizedCurrently 5140 5062
## 18 2020-03-28 positiveIncrease 19602 19925
## 19 2020-03-28 deathIncrease 551 544
## 20 2020-03-29 deathIncrease 504 515
## 21 2020-03-30 positiveIncrease 21467 22042
## 22 2020-03-31 positiveIncrease 25187 24853
## 23 2020-03-31 deathIncrease 907 890
## 24 2020-04-01 positiveIncrease 26115 25791
## 25 2020-04-06 positiveIncrease 28425 29002
## 26 2020-04-09 positiveIncrease 35090 34503
## 27 2020-04-10 positiveIncrease 33489 34380
## 28 2020-04-10 deathIncrease 2072 2108
## 29 2020-04-11 positiveIncrease 31105 30501
## 30 2020-04-11 deathIncrease 2079 2054
## 31 2020-04-13 positiveIncrease 24398 25195
## 32 2020-04-14 positiveIncrease 26078 25719
## 33 2020-04-15 positiveIncrease 29859 30307
## 34 2020-04-16 positiveIncrease 31577 30978
## 35 2020-04-23 deathIncrease 1814 1791
## 36 2020-04-24 deathIncrease 1972 1895
## 37 2020-04-25 deathIncrease 1627 1748
## 38 2020-04-27 deathIncrease 1287 1270
## 39 2020-04-29 deathIncrease 2685 2713
## 40 2020-05-01 deathIncrease 1808 1779
## 41 2020-05-02 deathIncrease 1531 1562
## 42 2020-05-05 deathIncrease 2494 2452
## 43 2020-05-06 deathIncrease 1916 1948
## 44 2020-05-07 positiveIncrease 27227 27537
## 45 2020-05-12 positiveIncrease 22558 22890
## 46 2020-05-12 deathIncrease 1506 1486
## 47 2020-05-13 positiveIncrease 21628 21285
## 48 2020-05-13 deathIncrease 1734 1704
## 49 2020-05-14 deathIncrease 1852 1879
## 50 2020-05-15 positiveIncrease 25422 24685
## 51 2020-05-15 deathIncrease 1535 1507
## 52 2020-05-16 positiveIncrease 23586 24702
## 53 2020-05-16 deathIncrease 1237 987
## 54 2020-05-17 deathIncrease 873 849
## 55 2020-05-21 deathIncrease 1377 1394
## 56 2020-05-22 positiveIncrease 24173 24433
## 57 2020-05-22 deathIncrease 1291 1341
## 58 2020-05-23 positiveIncrease 22365 21531
## 59 2020-05-23 deathIncrease 1038 1063
## 60 2020-05-24 positiveIncrease 18860 20072
## 61 2020-05-24 deathIncrease 689 680
## 62 2020-05-26 deathIncrease 665 645
## 63 2020-05-27 deathIncrease 1335 1321
## 64 2020-05-29 deathIncrease 1171 1184
## 65 2020-05-30 positiveIncrease 23437 23682
## 66 2020-06-01 deathIncrease 680 668
## 67 2020-06-02 deathIncrease 973 962
## 68 2020-06-03 positiveIncrease 20155 20390
## 69 2020-06-03 deathIncrease 974 993
## 70 2020-06-04 positiveIncrease 20383 20886
## 71 2020-06-04 deathIncrease 881 893
## 72 2020-06-05 positiveIncrease 23066 23394
## 73 2020-06-05 deathIncrease 837 826
## 74 2020-06-06 positiveIncrease 22558 23064
## 75 2020-06-06 deathIncrease 714 728
## 76 2020-06-08 deathIncrease 674 661
## 77 2020-06-09 deathIncrease 891 902
## 78 2020-06-12 positiveIncrease 23096 23597
## 79 2020-06-12 deathIncrease 766 775
## 80 2020-06-15 deathIncrease 387 381
## 81 2020-06-16 deathIncrease 718 730
## 82 2020-06-17 deathIncrease 779 767
## 83 2020-06-18 positiveIncrease 27089 27746
## 84 2020-06-18 deathIncrease 685 705
## 85 2020-06-19 positiveIncrease 30959 31471
## 86 2020-06-20 positiveIncrease 31951 32294
## 87 2020-06-20 deathIncrease 615 629
## 88 2020-06-21 positiveIncrease 28848 27928
## 89 2020-06-23 positiveIncrease 33885 33447
## 90 2020-06-23 deathIncrease 722 710
## 91 2020-06-24 deathIncrease 707 724
## 92 2020-06-26 deathIncrease 621 637
## 93 2020-06-27 deathIncrease 503 511
## 94 2020-06-29 deathIncrease 338 332
## 95 2020-06-30 deathIncrease 580 596
## 96 2020-07-02 positiveIncrease 53508 54085
## 97 2020-07-04 deathIncrease 300 306
## 98 2020-07-06 positiveIncrease 41494 41959
## 99 2020-07-06 deathIncrease 235 243
## 100 2020-07-07 deathIncrease 910 923
## 101 2020-07-08 deathIncrease 818 807
## 102 2020-07-10 deathIncrease 835 854
## 103 2020-07-14 positiveIncrease 59250 62687
## 104 2020-07-14 deathIncrease 745 736
## 105 2020-07-15 positiveIncrease 69101 65797
## 106 2020-07-20 deathIncrease 375 363
## 107 2020-07-22 deathIncrease 1142 1171
## 108 2020-07-23 deathIncrease 1074 1056
## 109 2020-07-25 deathIncrease 1009 1023
## 110 2020-07-26 positiveIncrease 60123 61000
## 111 2020-07-30 deathIncrease 1245 1259
## 112 2020-07-31 deathIncrease 1329 1312
## 113 2020-08-01 positiveIncrease 60245 61101
## 114 2020-08-02 positiveIncrease 52737 46812
## 115 2020-08-03 positiveIncrease 43122 49713
## 116 2020-08-06 deathIncrease 1237 1251
## 117 2020-08-08 positiveIncrease 53083 53712
## 118 2020-08-14 positiveIncrease 57093 55636
## 119 2020-08-16 positiveIncrease 41782 42487
## 120 2020-08-20 deathIncrease 1122 1134
## 121 2020-08-22 positiveIncrease 45722 46236
## 122 2020-08-24 positiveIncrease 34249 34643
## 123 2020-08-29 positiveIncrease 43962 44501
## 124 2020-08-31 deathIncrease 377 366
## 125 2020-09-02 positiveIncrease 30216 30603
## 126 2020-09-07 positiveIncrease 28142 28682
## 127 2020-09-08 deathIncrease 350 358
## 128 2020-09-10 deathIncrease 1156 1170
## 129 2020-09-12 deathIncrease 821 810
## 130 2020-09-15 positiveIncrease 34945 35445
## 131 2020-09-16 deathIncrease 1184 1200
## 132 2020-09-17 deathIncrease 878 863
## 133 2020-09-19 positiveIncrease 44906 45564
## 134 2020-09-19 deathIncrease 751 740
## 135 2020-09-20 positiveIncrease 35504 36295
## 136 2020-09-24 deathIncrease 933 921
## 137 2020-09-27 positiveIncrease 34983 35454
## 138 2020-09-28 positiveIncrease 35362 36524
## 139 2020-09-28 deathIncrease 245 257
## 140 2020-10-02 deathIncrease 844 835
## 141 2020-10-04 positiveIncrease 37982 38439
## 142 2020-10-04 deathIncrease 373 363
## 143 2020-10-06 deathIncrease 622 634
## 144 2020-10-09 deathIncrease 905 893
## 145 2020-10-10 deathIncrease 690 665
## 146 2020-10-11 positiveIncrease 46268 46946
## 147 2020-10-12 positiveIncrease 42645 43124
## 148 2020-10-13 deathIncrease 724 690
## 149 2020-10-14 deathIncrease 798 811
## 150 2020-10-15 deathIncrease 928 951
## 151 2020-10-16 deathIncrease 894 877
## 152 2020-10-17 positiveIncrease 57355 57943
## 153 2020-10-18 positiveIncrease 48280 48922
## 154 2020-10-18 deathIncrease 402 393
## 155 2020-10-19 deathIncrease 451 456
## 156 2020-10-21 positiveIncrease 60980 58606
## 157 2020-10-22 positiveIncrease 73003 75248
## 158 2020-10-22 deathIncrease 1126 1143
## 159 2020-10-23 deathIncrease 941 916
## 160 2020-10-24 deathIncrease 897 885
## Joining, by = c("date", "name")
## Warning: Removed 51 row(s) containing missing values (geom_path).
##
##
## *** Difference of at least 5 and difference is at least 1%:
## Joining, by = c("state", "name")
## state name newValue oldValue
## 1 AK positiveIncrease 12523 13535
## 2 CO positiveIncrease 93398 91570
## 3 CO deathIncrease 2218 2076
## 4 FL positiveIncrease 766305 776249
## 5 ND deathIncrease 453 345
## 6 NM positiveIncrease 41040 40168
## 7 NM hospitalizedCurrently 27399 27120
## 8 PR positiveIncrease 31067 61275
## 9 RI positiveIncrease 30581 30116
## 10 WA hospitalizedCurrently 92643 69716
## Rows: 16,082
## Columns: 55
## $ date <date> 2020-12-14, 2020-12-14, 2020-12-14, 20...
## $ state <chr> "AK", "AL", "AR", "AS", "AZ", "CA", "CO...
## $ positive <dbl> 40160, 297895, 187507, 0, 420248, 15850...
## $ probableCases <dbl> NA, 53133, 26701, NA, 15954, NA, 11826,...
## $ negative <dbl> 1107400, 1478907, 1705843, 2140, 212692...
## $ pending <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestResultsSource <chr> "totalTestsViral", "totalTestsPeopleVir...
## $ totalTestResults <dbl> 1147560, 1723669, 1866199, 2140, 434469...
## $ hospitalizedCurrently <dbl> 140, 2286, 1050, NA, 3677, 14578, 1585,...
## $ hospitalizedCumulative <dbl> 889, 28913, 9991, NA, 31142, NA, 16174,...
## $ inIcuCurrently <dbl> NA, NA, 372, NA, 829, 3078, NA, NA, 63,...
## $ inIcuCumulative <dbl> NA, 2363, NA, NA, NA, NA, NA, NA, NA, N...
## $ onVentilatorCurrently <dbl> 12, NA, 180, NA, 542, NA, NA, NA, 34, N...
## $ onVentilatorCumulative <dbl> NA, 1353, 1095, NA, NA, NA, NA, NA, NA,...
## $ recovered <dbl> 7165, 174805, 163351, NA, 62118, NA, 15...
## $ dataQualityGrade <chr> "A", "A", "A+", "D", "A+", "B", "A", "C...
## $ lastUpdateEt <chr> "12/14/2020 03:59", "12/14/2020 11:00",...
## $ dateModified <dttm> 2020-12-14 03:59:00, 2020-12-14 11:00:...
## $ checkTimeEt <chr> "12/13 22:59", "12/14 06:00", "12/13 19...
## $ death <dbl> 176, 4102, 2990, 0, 7358, 21046, 3969, ...
## $ hospitalized <dbl> 889, 28913, 9991, NA, 31142, NA, 16174,...
## $ dateChecked <dttm> 2020-12-14 03:59:00, 2020-12-14 11:00:...
## $ totalTestsViral <dbl> 1147560, NA, 1866199, 2140, 4344693, 27...
## $ positiveTestsViral <dbl> 47796, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ negativeTestsViral <dbl> 1098479, NA, 1705843, NA, NA, NA, NA, N...
## $ positiveCasesViral <dbl> NA, 244762, 160356, 0, 404294, 1585044,...
## $ deathConfirmed <dbl> 176, 3624, 2656, NA, 6782, NA, 3398, 43...
## $ deathProbable <dbl> NA, 478, 334, NA, 576, NA, 571, 1047, N...
## $ totalTestEncountersViral <dbl> NA, NA, NA, NA, NA, NA, 3809889, NA, 78...
## $ totalTestsPeopleViral <dbl> NA, 1723669, NA, NA, 2531222, NA, 19688...
## $ totalTestsAntibody <dbl> NA, NA, NA, NA, 377958, NA, 235882, NA,...
## $ positiveTestsAntibody <dbl> NA, NA, NA, NA, NA, NA, 22560, NA, NA, ...
## $ negativeTestsAntibody <dbl> NA, NA, NA, NA, NA, NA, 212548, NA, NA,...
## $ totalTestsPeopleAntibody <dbl> NA, 78262, NA, NA, NA, NA, NA, NA, NA, ...
## $ positiveTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ negativeTestsPeopleAntibody <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ totalTestsPeopleAntigen <dbl> NA, NA, 179158, NA, NA, NA, NA, NA, NA,...
## $ positiveTestsPeopleAntigen <dbl> NA, NA, 32616, NA, NA, NA, NA, NA, NA, ...
## $ totalTestsAntigen <dbl> NA, NA, 21856, NA, NA, NA, NA, 49816, N...
## $ positiveTestsAntigen <dbl> NA, NA, 3300, NA, NA, NA, NA, NA, NA, N...
## $ fips <chr> "02", "01", "05", "60", "04", "06", "08...
## $ positiveIncrease <dbl> 422, 2264, 1805, 0, 11806, 33278, 2911,...
## $ negativeIncrease <dbl> 4040, 20347, 9262, 0, 7223, 323174, 103...
## $ total <dbl> 1147560, 1776802, 1893350, 2140, 254717...
## $ totalTestResultsIncrease <dbl> 4462, 27230, 10495, 0, 38805, 356452, 3...
## $ posNeg <dbl> 1147560, 1776802, 1893350, 2140, 254717...
## $ deathIncrease <dbl> 0, 0, 45, 0, 1, 77, 11, 81, 1, 0, 138, ...
## $ hospitalizedIncrease <dbl> 3, 767, 64, 0, 193, 0, 48, 0, 0, 0, 142...
## $ hash <chr> "58baf833f72d7115b62a7e4dd1ab3545263286...
## $ commercialScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeRegularScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ negativeScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ positiveScore <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ score <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ grade <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
##
##
## Control totals - note that validState other than TRUE will be discarded
##
## # A tibble: 2 x 6
## validState cases deaths hosp tests n
## <lgl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 FALSE 72033 1426 NA 510886 1370
## 2 TRUE 16267270 290978 NA 220582479 14712
## Rows: 14,712
## Columns: 6
## $ date <date> 2020-12-14, 2020-12-14, 2020-12-14, 2020-12-14, 2020-12-14,...
## $ state <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", ...
## $ cases <dbl> 422, 2264, 1805, 11806, 33278, 2911, 7231, 164, 997, 8343, 3...
## $ deaths <dbl> 0, 0, 45, 1, 77, 11, 81, 1, 0, 138, 28, 0, 60, 6, 116, 35, 3...
## $ hosp <dbl> 140, 2286, 1050, 3677, 14578, 1585, 1243, 239, 373, 4932, 33...
## $ tests <dbl> 4462, 27230, 10495, 38805, 356452, 36588, 119244, 4714, 9931...
## Rows: 14,712
## Columns: 14
## $ date <date> 2020-01-22, 2020-01-22, 2020-01-23, 2020-01-23, 2020-01-24,...
## $ state <chr> "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", "MA", "WA", ...
## $ cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hosp <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tests <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ cpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ dpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ hpm <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm <dbl> 0.0000000, 0.0000000, 0.1471796, 0.0000000, 0.0000000, 0.000...
## $ cpm7 <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ dpm7 <dbl> NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, NA...
## $ hpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ tpm7 <dbl> NA, NA, NA, NA, NA, NA, 0.04205130, 0.00000000, 0.06307695, ...
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
##
## Recency is defined as 2020-11-15 through current
##
## Recency is defined as 2020-11-15 through current
## Warning: Removed 4 row(s) containing missing values (geom_path).
## Warning: Removed 4 row(s) containing missing values (geom_path).
## `summarise()` regrouping output by 'state', 'cluster', 'date' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
saveToRDS(old_hier5_201214, ovrWriteError=FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/old_hier5_201214.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
New county-level data are downloaded, with existing segments used:
# Locations for the pre-downloaded population, case, and death files
popLoc <- "./RInputFiles/Coronavirus/covid_county_population_usafacts.csv"
caseLoc <- "./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20201215.csv"
deathLoc <- "./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201215.csv"
# Run old segments against new county-level data, reusing the county clusters
# from the Dec 03 run and comparing against the Oct 26 burden file.
# dlCaseDeath: download fresh case/death files only when either local copy is
# missing. Scalar `&&` (not elementwise `&`) is the correct operator here:
# this is a single TRUE/FALSE decision, and `&&` short-circuits.
cty_old_20201215 <- readRunUSAFacts(maxDate="2020-12-13",
                                    popLoc=popLoc,
                                    caseLoc=caseLoc,
                                    deathLoc=deathLoc,
                                    dlCaseDeath=!(file.exists(caseLoc) && file.exists(deathLoc)),
                                    oldFile=readFromRDS("cty_20201026")$dfBurden,
                                    existingCountyClusters=readFromRDS("cty_new_20201203")$clustVec
)
##
## -- Column specification --------------------------------------------------------
## cols(
## countyFIPS = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## population = col_double()
## )
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
## Rows: 1,044,765
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumCases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## Warning: Missing column names filled in: 'X332' [332]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## X332 = col_logical()
## )
## i Use `spec()` for the full column specifications.
## Warning: 1 parsing failure.
## row col expected actual file
## 3196 X332 1/0/T/F/TRUE/FALSE './RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201215.csv'
## Warning: Problem with `mutate()` input `date`.
## i 3196 failed to parse.
## i Input `date` is `lubridate::mdy(date)`.
## Warning: 3196 failed to parse.
## Rows: 1,048,288
## Columns: 6
## $ countyFIPS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "State...
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A...
## $ stateFIPS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01...
## $ cumDeaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## `geom_smooth()` using formula 'y ~ x'
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
##
## Shapes will be created without any floor on the number of cases per million
## Shapes will be created without any floor on the number of deaths per million
## *** Counties with 0 cases/deaths or that fall below the floor for minCase/minDeath ***
## # A tibble: 1 x 4
## cpm_mean_is0 dpm_mean_is0 dpm_mean_ltDeath cpm_mean_ltCase
## <dbl> <dbl> <dbl> <dbl>
## 1 0 0.00251 0 0
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'date', 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
##
## Recency is defined as 2020-11-14 through current
##
## Recency is defined as 2020-11-14 through current
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
## Joining, by = "fipsCounty"
## Joining, by = "fipsCounty"
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
saveToRDS(cty_old_20201215, ovrWriteError=FALSE)
##
## File already exists: ./RInputFiles/Coronavirus/cty_old_20201215.RDS
##
## Not replacing the existing file since ovrWrite=FALSE
## NULL
Comparisons are made between the state-level totals as reflected in COVID Tracking Project and USA Facts:
# State-level totals from USA Facts (daily county data already rolled up to
# state by the readRunUSAFacts pipeline), restricted to a common as-of date.
# na.rm=TRUE on BOTH metrics: the original applied it to cases only, so a
# single missing day would have turned a state's death total into NA.
usaFacts <- cty_old_20201215$clusterStateData %>%
  filter(date <= as.Date("2020-12-10")) %>%
  group_by(state) %>%
  summarize(cases=sum(cases, na.rm=TRUE), deaths=sum(deaths, na.rm=TRUE), .groups="drop")
# Matching state-level totals from the COVID Tracking Project, same cutoff.
ctp <- old_hier5_201214$plotData %>%
  filter(date <= as.Date("2020-12-10")) %>%
  group_by(state) %>%
  summarize(cases=sum(cases, na.rm=TRUE), deaths=sum(deaths, na.rm=TRUE), .groups="drop")
# Stack the two sources (bind_rows .id tags rows "1"/"2"; the named lookup
# vector maps those tags to readable source names), then reshape to long form:
# one row per source x state x metric (cases/deaths).
usaCTP <- usaFacts %>%
  bind_rows(ctp, .id="source") %>%
  mutate(source=c('1'="USA Facts", '2'="COVID Tracking Project")[source]) %>%
  pivot_longer(-c(source, state), names_to="metric", values_to="value")
# Plot COVID Tracking Project totals as a percentage of USA Facts totals,
# one point per state, faceted by metric (cases/deaths).
usaCTP %>%
  # Name id_cols explicitly: passing it positionally was valid in tidyr 1.0/1.1
  # but breaks in tidyr >= 1.2, where the second positional slot became `...`.
  pivot_wider(id_cols=c(state, metric), names_from="source", values_from="value") %>%
  mutate(pct=`COVID Tracking Project`/`USA Facts`) %>%
  # Order states by their worst (largest absolute) deviation across metrics
  ggplot(aes(x=fct_reorder(state, abs(pct-1), .fun=max), y=pct)) +
  geom_point() +
  coord_flip() +
  labs(x="",
       y="COVID Tracking Project as % of USA Facts",
       title="Comparison of COVID Tracking Project and USA Facts",
       subtitle="Data as of 10-DEC-2020"
  ) +
  geom_hline(yintercept=1, lty=2) +  # reference: sources agree exactly at 1
  facet_wrap(~metric)
# Same comparison, but labeled with percentages and restricted to states where
# at least one metric differs by 2.5% or more between the two sources.
usaCTP %>%
  # Name id_cols explicitly: positional id_cols breaks under tidyr >= 1.2,
  # where the second positional slot became `...`.
  pivot_wider(id_cols=c(state, metric), names_from="source", values_from="value") %>%
  mutate(pct=`COVID Tracking Project`/`USA Facts`) %>%
  group_by(state) %>%
  filter(max(abs(pct-1))>=0.025) %>%  # keep states whose worst metric deviates >= 2.5%
  ungroup() %>%  # drop the grouping before handing the data to ggplot
  ggplot(aes(x=fct_reorder(state, abs(pct-1), .fun=max), y=pct)) +
  geom_text(aes(label=paste0(round(100*pct, 1), "%")), size=3) +
  coord_flip() +
  labs(x="",
       y="COVID Tracking Project as % of USA Facts",
       title="Comparison of COVID Tracking Project and USA Facts",
       subtitle="Data as of 10-DEC-2020 (filtered to only states at least 2.5% different on one metric)"
  ) +
  geom_hline(yintercept=1, lty=2) +  # reference: sources agree exactly at 1
  facet_wrap(~metric)
It may be particularly interesting to explore some of the larger differences:
Virginia stands out for differing by ~25% on both cases and deaths. Iowa stands out for differing by ~15% on both cases and deaths, but in opposite directions.